Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / pdf / document_loader.cc
blob868a8c4ce4f1367f6a011101efe453a8813c6557
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/document_loader.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/http/http_util.h"
10 #include "ppapi/c/pp_errors.h"
11 #include "ppapi/cpp/url_loader.h"
12 #include "ppapi/cpp/url_request_info.h"
13 #include "ppapi/cpp/url_response_info.h"
15 namespace chrome_pdf {
17 namespace {
// Size threshold for partial loading: only documents strictly larger than this
// are fetched with range requests; anything else is downloaded in one request.
const uint32_t kMinFileSize = 64 * 1024;
22 // If the headers have a byte-range response, writes the start and end
23 // positions and returns true if at least the start position was parsed.
24 // The end position will be set to 0 if it was not found or parsed from the
25 // response.
26 // Returns false if not even a start position could be parsed.
27 bool GetByteRange(const std::string& headers, uint32_t* start, uint32_t* end) {
28 net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
29 while (it.GetNext()) {
30 if (base::LowerCaseEqualsASCII(it.name(), "content-range")) {
31 std::string range = it.values().c_str();
32 if (base::StartsWith(range, "bytes",
33 base::CompareCase::INSENSITIVE_ASCII)) {
34 range = range.substr(strlen("bytes"));
35 std::string::size_type pos = range.find('-');
36 std::string range_end;
37 if (pos != std::string::npos)
38 range_end = range.substr(pos + 1);
39 TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
40 TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
41 *start = atoi(range.c_str());
42 *end = atoi(range_end.c_str());
43 return true;
47 return false;
50 // If the headers have a multi-part response, returns the boundary name.
51 // Otherwise returns an empty string.
52 std::string GetMultiPartBoundary(const std::string& headers) {
53 net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
54 while (it.GetNext()) {
55 if (base::LowerCaseEqualsASCII(it.name(), "content-type")) {
56 std::string type = base::ToLowerASCII(it.values());
57 if (base::StartsWith(type, "multipart/", base::CompareCase::SENSITIVE)) {
58 const char* boundary = strstr(type.c_str(), "boundary=");
59 if (!boundary) {
60 NOTREACHED();
61 break;
64 return std::string(boundary + 9);
68 return std::string();
71 bool IsValidContentType(const std::string& type) {
72 return (base::EndsWith(type, "/pdf", base::CompareCase::INSENSITIVE_ASCII) ||
73 base::EndsWith(type, ".pdf", base::CompareCase::INSENSITIVE_ASCII) ||
74 base::EndsWith(type, "/x-pdf",
75 base::CompareCase::INSENSITIVE_ASCII) ||
76 base::EndsWith(type, "/*", base::CompareCase::INSENSITIVE_ASCII) ||
77 base::EndsWith(type, "/acrobat",
78 base::CompareCase::INSENSITIVE_ASCII) ||
79 base::EndsWith(type, "/unknown",
80 base::CompareCase::INSENSITIVE_ASCII));
83 } // namespace
85 DocumentLoader::Client::~Client() {
88 DocumentLoader::DocumentLoader(Client* client)
89 : client_(client), partial_document_(false), request_pending_(false),
90 current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
91 document_size_(0), header_request_(true), is_multipart_(false) {
92 loader_factory_.Initialize(this);
95 DocumentLoader::~DocumentLoader() {
98 bool DocumentLoader::Init(const pp::URLLoader& loader,
99 const std::string& url,
100 const std::string& headers) {
101 DCHECK(url_.empty());
102 url_ = url;
103 loader_ = loader;
105 std::string response_headers;
106 if (!headers.empty()) {
107 response_headers = headers;
108 } else {
109 pp::URLResponseInfo response = loader_.GetResponseInfo();
110 pp::Var headers_var = response.GetHeaders();
112 if (headers_var.is_string()) {
113 response_headers = headers_var.AsString();
117 bool accept_ranges_bytes = false;
118 bool content_encoded = false;
119 uint32_t content_length = 0;
120 std::string type;
121 std::string disposition;
123 // This happens for PDFs not loaded from http(s) sources.
124 if (response_headers == "Content-Type: text/plain") {
125 if (!base::StartsWith(url, "http://",
126 base::CompareCase::INSENSITIVE_ASCII) &&
127 !base::StartsWith(url, "https://",
128 base::CompareCase::INSENSITIVE_ASCII)) {
129 type = "application/pdf";
132 if (type.empty() && !response_headers.empty()) {
133 net::HttpUtil::HeadersIterator it(response_headers.begin(),
134 response_headers.end(), "\n");
135 while (it.GetNext()) {
136 if (base::LowerCaseEqualsASCII(it.name(), "content-length")) {
137 content_length = atoi(it.values().c_str());
138 } else if (base::LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
139 accept_ranges_bytes = base::LowerCaseEqualsASCII(it.values(), "bytes");
140 } else if (base::LowerCaseEqualsASCII(it.name(), "content-encoding")) {
141 content_encoded = true;
142 } else if (base::LowerCaseEqualsASCII(it.name(), "content-type")) {
143 type = it.values();
144 size_t semi_colon_pos = type.find(';');
145 if (semi_colon_pos != std::string::npos) {
146 type = type.substr(0, semi_colon_pos);
148 TrimWhitespace(type, base::TRIM_ALL, &type);
149 } else if (base::LowerCaseEqualsASCII(it.name(), "content-disposition")) {
150 disposition = it.values();
154 if (!type.empty() && !IsValidContentType(type))
155 return false;
156 if (base::StartsWith(disposition, "attachment",
157 base::CompareCase::INSENSITIVE_ASCII))
158 return false;
160 if (content_length > 0)
161 chunk_stream_.Preallocate(content_length);
163 document_size_ = content_length;
164 requests_count_ = 0;
166 // Enable partial loading only if file size is above the threshold.
167 // It will allow avoiding latency for multiple requests.
168 if (content_length > kMinFileSize &&
169 accept_ranges_bytes &&
170 !content_encoded) {
171 LoadPartialDocument();
172 } else {
173 LoadFullDocument();
175 return true;
178 void DocumentLoader::LoadPartialDocument() {
179 partial_document_ = true;
180 // Force the main request to be cancelled, since if we're a full-frame plugin
181 // there could be other references to the loader.
182 loader_.Close();
183 loader_ = pp::URLLoader();
184 // Download file header.
185 header_request_ = true;
186 RequestData(0, std::min(GetRequestSize(), document_size_));
189 void DocumentLoader::LoadFullDocument() {
190 partial_document_ = false;
191 chunk_buffer_.clear();
192 ReadMore();
195 bool DocumentLoader::IsDocumentComplete() const {
196 if (document_size_ == 0) // Document size unknown.
197 return false;
198 return IsDataAvailable(0, document_size_);
201 uint32_t DocumentLoader::GetAvailableData() const {
202 if (document_size_ == 0) { // If document size is unknown.
203 return current_pos_;
206 std::vector<std::pair<size_t, size_t> > ranges;
207 chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
208 uint32_t available = document_size_;
209 for (const auto& range : ranges)
210 available -= range.second;
211 return available;
214 void DocumentLoader::ClearPendingRequests() {
215 // The first item in the queue is pending (need to keep it in the queue).
216 if (pending_requests_.size() > 1) {
217 // Remove all elements except the first one.
218 pending_requests_.erase(++pending_requests_.begin(),
219 pending_requests_.end());
223 bool DocumentLoader::GetBlock(uint32_t position,
224 uint32_t size,
225 void* buf) const {
226 return chunk_stream_.ReadData(position, size, buf);
229 bool DocumentLoader::IsDataAvailable(uint32_t position, uint32_t size) const {
230 return chunk_stream_.IsRangeAvailable(position, size);
233 void DocumentLoader::RequestData(uint32_t position, uint32_t size) {
234 DCHECK(partial_document_);
236 // We have some artefact request from
237 // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
238 // document is complete.
239 // We need this fix in PDFIum. Adding this as a work around.
240 // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
241 // Test url:
242 // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
243 if (IsDocumentComplete())
244 return;
246 pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
247 DownloadPendingRequests();
250 void DocumentLoader::DownloadPendingRequests() {
251 if (request_pending_ || pending_requests_.empty())
252 return;
254 // Remove already completed requests.
255 // By design DownloadPendingRequests() should have at least 1 request in the
256 // queue. ReadComplete() will remove the last pending comment from the queue.
257 while (pending_requests_.size() > 1) {
258 if (IsDataAvailable(pending_requests_.front().first,
259 pending_requests_.front().second)) {
260 pending_requests_.pop_front();
261 } else {
262 break;
266 uint32_t pos = pending_requests_.front().first;
267 uint32_t size = pending_requests_.front().second;
268 if (IsDataAvailable(pos, size)) {
269 ReadComplete();
270 return;
273 // If current request has been partially downloaded already, split it into
274 // a few smaller requests.
275 std::vector<std::pair<size_t, size_t> > ranges;
276 chunk_stream_.GetMissedRanges(pos, size, &ranges);
277 if (!ranges.empty()) {
278 pending_requests_.pop_front();
279 pending_requests_.insert(pending_requests_.begin(),
280 ranges.begin(), ranges.end());
281 pos = pending_requests_.front().first;
282 size = pending_requests_.front().second;
285 uint32_t cur_request_size = GetRequestSize();
286 // If size is less than default request, try to expand download range for
287 // more optimal download.
288 if (size < cur_request_size && partial_document_) {
289 // First, try to expand block towards the end of the file.
290 uint32_t new_pos = pos;
291 uint32_t new_size = cur_request_size;
292 if (pos + new_size > document_size_)
293 new_size = document_size_ - pos;
295 std::vector<std::pair<size_t, size_t> > ranges;
296 if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
297 new_pos = ranges[0].first;
298 new_size = ranges[0].second;
301 // Second, try to expand block towards the beginning of the file.
302 if (new_size < cur_request_size) {
303 uint32_t block_end = new_pos + new_size;
304 if (block_end > cur_request_size) {
305 new_pos = block_end - cur_request_size;
306 } else {
307 new_pos = 0;
309 new_size = block_end - new_pos;
311 if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
312 new_pos = ranges.back().first;
313 new_size = ranges.back().second;
316 pos = new_pos;
317 size = new_size;
320 size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
321 size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
322 if (pos - last_byte_before < cur_request_size) {
323 size = pos + size - last_byte_before;
324 pos = last_byte_before;
327 if ((pos + size < first_byte_after) &&
328 (pos + size + cur_request_size >= first_byte_after))
329 size = first_byte_after - pos;
331 request_pending_ = true;
333 // Start downloading first pending request.
334 loader_.Close();
335 loader_ = client_->CreateURLLoader();
336 pp::CompletionCallback callback =
337 loader_factory_.NewCallback(&DocumentLoader::DidOpen);
338 pp::URLRequestInfo request = GetRequest(pos, size);
339 requests_count_++;
340 int rv = loader_.Open(request, callback);
341 if (rv != PP_OK_COMPLETIONPENDING)
342 callback.Run(rv);
345 pp::URLRequestInfo DocumentLoader::GetRequest(uint32_t position,
346 uint32_t size) const {
347 pp::URLRequestInfo request(client_->GetPluginInstance());
348 request.SetURL(url_);
349 request.SetMethod("GET");
350 request.SetFollowRedirects(true);
351 request.SetCustomReferrerURL(url_);
353 const size_t kBufSize = 100;
354 char buf[kBufSize];
355 // According to rfc2616, byte range specifies position of the first and last
356 // bytes in the requested range inclusively. Therefore we should subtract 1
357 // from the position + size, to get index of the last byte that needs to be
358 // downloaded.
359 base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
360 position + size - 1);
361 pp::Var header(buf);
362 request.SetHeaders(header);
364 return request;
367 void DocumentLoader::DidOpen(int32_t result) {
368 if (result != PP_OK) {
369 NOTREACHED();
370 return;
373 int32_t http_code = loader_.GetResponseInfo().GetStatusCode();
374 if (http_code >= 400 && http_code < 500) {
375 // Error accessing resource. 4xx error indicate subsequent requests
376 // will fail too.
377 // E.g. resource has been removed from the server while loading it.
378 // https://code.google.com/p/chromium/issues/detail?id=414827
379 return;
382 is_multipart_ = false;
383 current_chunk_size_ = 0;
384 current_chunk_read_ = 0;
386 pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
387 std::string headers;
388 if (headers_var.is_string())
389 headers = headers_var.AsString();
391 std::string boundary = GetMultiPartBoundary(headers);
392 if (!boundary.empty()) {
393 // Leave position untouched for now, when we read the data we'll get it.
394 is_multipart_ = true;
395 multipart_boundary_ = boundary;
396 } else {
397 // Need to make sure that the server returned a byte-range, since it's
398 // possible for a server to just ignore our bye-range request and just
399 // return the entire document even if it supports byte-range requests.
400 // i.e. sniff response to
401 // http://www.act.org/compass/sample/pdf/geometry.pdf
402 current_pos_ = 0;
403 uint32_t start_pos, end_pos;
404 if (GetByteRange(headers, &start_pos, &end_pos)) {
405 current_pos_ = start_pos;
406 if (end_pos && end_pos > start_pos)
407 current_chunk_size_ = end_pos - start_pos + 1;
411 ReadMore();
414 void DocumentLoader::ReadMore() {
415 pp::CompletionCallback callback =
416 loader_factory_.NewCallback(&DocumentLoader::DidRead);
417 int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
418 if (rv != PP_OK_COMPLETIONPENDING)
419 callback.Run(rv);
422 void DocumentLoader::DidRead(int32_t result) {
423 if (result > 0) {
424 char* start = buffer_;
425 size_t length = result;
426 if (is_multipart_ && result > 2) {
427 for (int i = 2; i < result; ++i) {
428 if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
429 (i >= 4 &&
430 buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
431 buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
432 uint32_t start_pos, end_pos;
433 if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
434 current_pos_ = start_pos;
435 start += i;
436 length -= i;
437 if (end_pos && end_pos > start_pos)
438 current_chunk_size_ = end_pos - start_pos + 1;
440 break;
444 // Reset this flag so we don't look inside the buffer in future calls of
445 // DidRead for this response. Note that this code DOES NOT handle multi-
446 // part responses with more than one part (we don't issue them at the
447 // moment, so they shouldn't arrive).
448 is_multipart_ = false;
451 if (current_chunk_size_ &&
452 current_chunk_read_ + length > current_chunk_size_)
453 length = current_chunk_size_ - current_chunk_read_;
455 if (length) {
456 if (document_size_ > 0) {
457 chunk_stream_.WriteData(current_pos_, start, length);
458 } else {
459 // If we did not get content-length in the response, we can't
460 // preallocate buffer for the entire document. Resizing array causing
461 // memory fragmentation issues on the large files and OOM exceptions.
462 // To fix this, we collect all chunks of the file to the list and
463 // concatenate them together after request is complete.
464 chunk_buffer_.push_back(std::vector<unsigned char>());
465 chunk_buffer_.back().resize(length);
466 memcpy(&(chunk_buffer_.back()[0]), start, length);
468 current_pos_ += length;
469 current_chunk_read_ += length;
470 client_->OnNewDataAvailable();
472 ReadMore();
473 } else if (result == PP_OK) {
474 ReadComplete();
475 } else {
476 NOTREACHED();
480 void DocumentLoader::ReadComplete() {
481 if (!partial_document_) {
482 if (document_size_ == 0) {
483 // For the document with no 'content-length" specified we've collected all
484 // the chunks already. Let's allocate final document buffer and copy them
485 // over.
486 chunk_stream_.Preallocate(current_pos_);
487 uint32_t pos = 0;
488 for (auto& chunk : chunk_buffer_) {
489 chunk_stream_.WriteData(pos, &(chunk[0]), chunk.size());
490 pos += chunk.size();
492 chunk_buffer_.clear();
494 document_size_ = current_pos_;
495 client_->OnDocumentComplete();
496 return;
499 request_pending_ = false;
500 pending_requests_.pop_front();
502 // If there are more pending request - continue downloading.
503 if (!pending_requests_.empty()) {
504 DownloadPendingRequests();
505 return;
508 if (IsDocumentComplete()) {
509 client_->OnDocumentComplete();
510 return;
513 if (header_request_)
514 client_->OnPartialDocumentLoaded();
515 else
516 client_->OnPendingRequestComplete();
517 header_request_ = false;
519 // The OnPendingRequestComplete could have added more requests.
520 if (!pending_requests_.empty()) {
521 DownloadPendingRequests();
522 } else {
523 // Document is not complete and we have no outstanding requests.
524 // Let's keep downloading PDF file in small chunks.
525 uint32_t pos = chunk_stream_.GetFirstMissingByte();
526 std::vector<std::pair<size_t, size_t> > ranges;
527 chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
528 DCHECK(!ranges.empty());
529 RequestData(ranges[0].first, ranges[0].second);
533 uint32_t DocumentLoader::GetRequestSize() const {
534 // Document loading strategy:
535 // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
536 // double the size (64k), and so on, until we cap max request size at 2M for
537 // 71 or more requests.
538 uint32_t limited_count = std::min(std::max(requests_count_, 10u), 70u);
539 return 32 * 1024 * (1 << ((limited_count - 1) / 10u));
542 } // namespace chrome_pdf