Blink roll 25b6bd3a7a131ffe68d809546ad1a20707915cdc:3a503f41ae42e5b79cfcd2ff10e65afde...
[chromium-blink-merge.git] / pdf / document_loader.cc
blobb2628a62712ec383fb9fa38f872dc20312c4d349
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/document_loader.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/http/http_util.h"
10 #include "ppapi/c/pp_errors.h"
11 #include "ppapi/cpp/url_loader.h"
12 #include "ppapi/cpp/url_request_info.h"
13 #include "ppapi/cpp/url_response_info.h"
15 namespace chrome_pdf {
17 // Document below size will be downloaded in one chunk.
18 const uint32 kMinFileSize = 64*1024;
20 DocumentLoader::DocumentLoader(Client* client)
21 : client_(client), partial_document_(false), request_pending_(false),
22 current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
23 document_size_(0), header_request_(true), is_multipart_(false) {
24 loader_factory_.Initialize(this);
27 DocumentLoader::~DocumentLoader() {
30 bool DocumentLoader::Init(const pp::URLLoader& loader,
31 const std::string& url,
32 const std::string& headers) {
33 DCHECK(url_.empty());
34 url_ = url;
35 loader_ = loader;
37 std::string response_headers;
38 if (!headers.empty()) {
39 response_headers = headers;
40 } else {
41 pp::URLResponseInfo response = loader_.GetResponseInfo();
42 pp::Var headers_var = response.GetHeaders();
44 if (headers_var.is_string()) {
45 response_headers = headers_var.AsString();
49 bool accept_ranges_bytes = false;
50 bool content_encoded = false;
51 uint32 content_length = 0;
52 std::string type;
53 std::string disposition;
54 if (!response_headers.empty()) {
55 net::HttpUtil::HeadersIterator it(response_headers.begin(),
56 response_headers.end(), "\n");
57 while (it.GetNext()) {
58 if (LowerCaseEqualsASCII(it.name(), "content-length")) {
59 content_length = atoi(it.values().c_str());
60 } else if (LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
61 accept_ranges_bytes = LowerCaseEqualsASCII(it.values(), "bytes");
62 } else if (LowerCaseEqualsASCII(it.name(), "content-encoding")) {
63 content_encoded = true;
64 } else if (LowerCaseEqualsASCII(it.name(), "content-type")) {
65 type = it.values();
66 size_t semi_colon_pos = type.find(';');
67 if (semi_colon_pos != std::string::npos) {
68 type = type.substr(0, semi_colon_pos);
70 TrimWhitespace(type, base::TRIM_ALL, &type);
71 } else if (LowerCaseEqualsASCII(it.name(), "content-disposition")) {
72 disposition = it.values();
76 if (!type.empty() &&
77 !EndsWith(type, "/pdf", false) &&
78 !EndsWith(type, ".pdf", false) &&
79 !EndsWith(type, "/x-pdf", false) &&
80 !EndsWith(type, "/*", false) &&
81 !EndsWith(type, "/acrobat", false) &&
82 !EndsWith(type, "/unknown", false)) {
83 return false;
85 if (StartsWithASCII(disposition, "attachment", false)) {
86 return false;
89 if (content_length > 0)
90 chunk_stream_.Preallocate(content_length);
92 document_size_ = content_length;
93 requests_count_ = 0;
95 // Enable partial loading only if file size is above the threshold.
96 // It will allow avoiding latency for multiple requests.
97 if (content_length > kMinFileSize &&
98 accept_ranges_bytes &&
99 !content_encoded) {
100 LoadPartialDocument();
101 } else {
102 LoadFullDocument();
104 return true;
107 void DocumentLoader::LoadPartialDocument() {
108 partial_document_ = true;
109 // Force the main request to be cancelled, since if we're a full-frame plugin
110 // there could be other references to the loader.
111 loader_.Close();
112 loader_ = pp::URLLoader();
113 // Download file header.
114 header_request_ = true;
115 RequestData(0, std::min(GetRequestSize(), document_size_));
118 void DocumentLoader::LoadFullDocument() {
119 partial_document_ = false;
120 chunk_buffer_.clear();
121 ReadMore();
124 bool DocumentLoader::IsDocumentComplete() const {
125 if (document_size_ == 0) // Document size unknown.
126 return false;
127 return IsDataAvailable(0, document_size_);
130 uint32 DocumentLoader::GetAvailableData() const {
131 if (document_size_ == 0) { // If document size is unknown.
132 return current_pos_;
135 std::vector<std::pair<size_t, size_t> > ranges;
136 chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
137 uint32 available = document_size_;
138 std::vector<std::pair<size_t, size_t> >::iterator it;
139 for (it = ranges.begin(); it != ranges.end(); ++it) {
140 available -= it->second;
142 return available;
145 void DocumentLoader::ClearPendingRequests() {
146 // The first item in the queue is pending (need to keep it in the queue).
147 if (pending_requests_.size() > 1) {
148 // Remove all elements except the first one.
149 pending_requests_.erase(++pending_requests_.begin(),
150 pending_requests_.end());
154 bool DocumentLoader::GetBlock(uint32 position, uint32 size, void* buf) const {
155 return chunk_stream_.ReadData(position, size, buf);
158 bool DocumentLoader::IsDataAvailable(uint32 position, uint32 size) const {
159 return chunk_stream_.IsRangeAvailable(position, size);
162 void DocumentLoader::RequestData(uint32 position, uint32 size) {
163 DCHECK(partial_document_);
165 // We have some artefact request from
166 // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
167 // document is complete.
168 // We need this fix in PDFIum. Adding this as a work around.
169 // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
170 // Test url:
171 // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
172 if (IsDocumentComplete())
173 return;
175 pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
176 DownloadPendingRequests();
179 void DocumentLoader::DownloadPendingRequests() {
180 if (request_pending_ || pending_requests_.empty())
181 return;
183 // Remove already completed requests.
184 // By design DownloadPendingRequests() should have at least 1 request in the
185 // queue. ReadComplete() will remove the last pending comment from the queue.
186 while (pending_requests_.size() > 1) {
187 if (IsDataAvailable(pending_requests_.front().first,
188 pending_requests_.front().second)) {
189 pending_requests_.pop_front();
190 } else {
191 break;
195 uint32 pos = pending_requests_.front().first;
196 uint32 size = pending_requests_.front().second;
197 if (IsDataAvailable(pos, size)) {
198 ReadComplete();
199 return;
202 // If current request has been partially downloaded already, split it into
203 // a few smaller requests.
204 std::vector<std::pair<size_t, size_t> > ranges;
205 chunk_stream_.GetMissedRanges(pos, size, &ranges);
206 if (ranges.size() > 0) {
207 pending_requests_.pop_front();
208 pending_requests_.insert(pending_requests_.begin(),
209 ranges.begin(), ranges.end());
210 pos = pending_requests_.front().first;
211 size = pending_requests_.front().second;
214 uint32 cur_request_size = GetRequestSize();
215 // If size is less than default request, try to expand download range for
216 // more optimal download.
217 if (size < cur_request_size && partial_document_) {
218 // First, try to expand block towards the end of the file.
219 uint32 new_pos = pos;
220 uint32 new_size = cur_request_size;
221 if (pos + new_size > document_size_)
222 new_size = document_size_ - pos;
224 std::vector<std::pair<size_t, size_t> > ranges;
225 if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
226 new_pos = ranges[0].first;
227 new_size = ranges[0].second;
230 // Second, try to expand block towards the beginning of the file.
231 if (new_size < cur_request_size) {
232 uint32 block_end = new_pos + new_size;
233 if (block_end > cur_request_size) {
234 new_pos = block_end - cur_request_size;
235 } else {
236 new_pos = 0;
238 new_size = block_end - new_pos;
240 if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
241 new_pos = ranges.back().first;
242 new_size = ranges.back().second;
245 pos = new_pos;
246 size = new_size;
249 size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
250 size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
251 if (pos - last_byte_before < cur_request_size) {
252 size = pos + size - last_byte_before;
253 pos = last_byte_before;
256 if ((pos + size < first_byte_after) &&
257 (pos + size + cur_request_size >= first_byte_after))
258 size = first_byte_after - pos;
260 request_pending_ = true;
262 // Start downloading first pending request.
263 loader_.Close();
264 loader_ = client_->CreateURLLoader();
265 pp::CompletionCallback callback =
266 loader_factory_.NewCallback(&DocumentLoader::DidOpen);
267 pp::URLRequestInfo request = GetRequest(pos, size);
268 requests_count_++;
269 int rv = loader_.Open(request, callback);
270 if (rv != PP_OK_COMPLETIONPENDING)
271 callback.Run(rv);
274 pp::URLRequestInfo DocumentLoader::GetRequest(uint32 position,
275 uint32 size) const {
276 pp::URLRequestInfo request(client_->GetPluginInstance());
277 request.SetURL(url_.c_str());
278 request.SetMethod("GET");
279 request.SetFollowRedirects(true);
281 const size_t kBufSize = 100;
282 char buf[kBufSize];
283 // According to rfc2616, byte range specifies position of the first and last
284 // bytes in the requested range inclusively. Therefore we should subtract 1
285 // from the position + size, to get index of the last byte that needs to be
286 // downloaded.
287 base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
288 position + size - 1);
289 pp::Var header(buf);
290 request.SetHeaders(header);
292 return request;
295 void DocumentLoader::DidOpen(int32_t result) {
296 if (result != PP_OK) {
297 NOTREACHED();
298 return;
301 int32_t http_code = loader_.GetResponseInfo().GetStatusCode();
302 if (http_code >= 400 && http_code < 500) {
303 // Error accessing resource. 4xx error indicate subsequent requests
304 // will fail too.
305 // E.g. resource has been removed from the server while loading it.
306 // https://code.google.com/p/chromium/issues/detail?id=414827
307 return;
310 is_multipart_ = false;
311 current_chunk_size_ = 0;
312 current_chunk_read_ = 0;
314 pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
315 std::string headers;
316 if (headers_var.is_string())
317 headers = headers_var.AsString();
319 std::string boundary = GetMultiPartBoundary(headers);
320 if (boundary.size()) {
321 // Leave position untouched for now, when we read the data we'll get it.
322 is_multipart_ = true;
323 multipart_boundary_ = boundary;
324 } else {
325 // Need to make sure that the server returned a byte-range, since it's
326 // possible for a server to just ignore our bye-range request and just
327 // return the entire document even if it supports byte-range requests.
328 // i.e. sniff response to
329 // http://www.act.org/compass/sample/pdf/geometry.pdf
330 current_pos_ = 0;
331 uint32 start_pos, end_pos;
332 if (GetByteRange(headers, &start_pos, &end_pos)) {
333 current_pos_ = start_pos;
334 if (end_pos && end_pos > start_pos)
335 current_chunk_size_ = end_pos - start_pos + 1;
339 ReadMore();
342 bool DocumentLoader::GetByteRange(const std::string& headers, uint32* start,
343 uint32* end) {
344 net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
345 while (it.GetNext()) {
346 if (LowerCaseEqualsASCII(it.name(), "content-range")) {
347 std::string range = it.values().c_str();
348 if (StartsWithASCII(range, "bytes", false)) {
349 range = range.substr(strlen("bytes"));
350 std::string::size_type pos = range.find('-');
351 std::string range_end;
352 if (pos != std::string::npos)
353 range_end = range.substr(pos + 1);
354 TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
355 TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
356 *start = atoi(range.c_str());
357 *end = atoi(range_end.c_str());
358 return true;
362 return false;
365 std::string DocumentLoader::GetMultiPartBoundary(const std::string& headers) {
366 net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
367 while (it.GetNext()) {
368 if (LowerCaseEqualsASCII(it.name(), "content-type")) {
369 std::string type = base::StringToLowerASCII(it.values());
370 if (StartsWithASCII(type, "multipart/", true)) {
371 const char* boundary = strstr(type.c_str(), "boundary=");
372 if (!boundary) {
373 NOTREACHED();
374 break;
377 return std::string(boundary + 9);
381 return std::string();
384 void DocumentLoader::ReadMore() {
385 pp::CompletionCallback callback =
386 loader_factory_.NewCallback(&DocumentLoader::DidRead);
387 int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
388 if (rv != PP_OK_COMPLETIONPENDING)
389 callback.Run(rv);
392 void DocumentLoader::DidRead(int32_t result) {
393 if (result > 0) {
394 char* start = buffer_;
395 size_t length = result;
396 if (is_multipart_ && result > 2) {
397 for (int i = 2; i < result; ++i) {
398 if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
399 (i >= 4 &&
400 buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
401 buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
402 uint32 start_pos, end_pos;
403 if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
404 current_pos_ = start_pos;
405 start += i;
406 length -= i;
407 if (end_pos && end_pos > start_pos)
408 current_chunk_size_ = end_pos - start_pos + 1;
410 break;
414 // Reset this flag so we don't look inside the buffer in future calls of
415 // DidRead for this response. Note that this code DOES NOT handle multi-
416 // part responses with more than one part (we don't issue them at the
417 // moment, so they shouldn't arrive).
418 is_multipart_ = false;
421 if (current_chunk_size_ &&
422 current_chunk_read_ + length > current_chunk_size_)
423 length = current_chunk_size_ - current_chunk_read_;
425 if (length) {
426 if (document_size_ > 0) {
427 chunk_stream_.WriteData(current_pos_, start, length);
428 } else {
429 // If we did not get content-length in the response, we can't
430 // preallocate buffer for the entire document. Resizing array causing
431 // memory fragmentation issues on the large files and OOM exceptions.
432 // To fix this, we collect all chunks of the file to the list and
433 // concatenate them together after request is complete.
434 chunk_buffer_.push_back(std::vector<unsigned char>());
435 chunk_buffer_.back().resize(length);
436 memcpy(&(chunk_buffer_.back()[0]), start, length);
438 current_pos_ += length;
439 current_chunk_read_ += length;
440 client_->OnNewDataAvailable();
442 ReadMore();
443 } else if (result == PP_OK) {
444 ReadComplete();
445 } else {
446 NOTREACHED();
450 void DocumentLoader::ReadComplete() {
451 if (!partial_document_) {
452 if (document_size_ == 0) {
453 // For the document with no 'content-length" specified we've collected all
454 // the chunks already. Let's allocate final document buffer and copy them
455 // over.
456 chunk_stream_.Preallocate(current_pos_);
457 uint32 pos = 0;
458 std::list<std::vector<unsigned char> >::iterator it;
459 for (it = chunk_buffer_.begin(); it != chunk_buffer_.end(); ++it) {
460 chunk_stream_.WriteData(pos, &((*it)[0]), it->size());
461 pos += it->size();
463 chunk_buffer_.clear();
465 document_size_ = current_pos_;
466 client_->OnDocumentComplete();
467 return;
470 request_pending_ = false;
471 pending_requests_.pop_front();
473 // If there are more pending request - continue downloading.
474 if (!pending_requests_.empty()) {
475 DownloadPendingRequests();
476 return;
479 if (IsDocumentComplete()) {
480 client_->OnDocumentComplete();
481 return;
484 if (header_request_)
485 client_->OnPartialDocumentLoaded();
486 else
487 client_->OnPendingRequestComplete();
488 header_request_ = false;
490 // The OnPendingRequestComplete could have added more requests.
491 if (!pending_requests_.empty()) {
492 DownloadPendingRequests();
493 } else {
494 // Document is not complete and we have no outstanding requests.
495 // Let's keep downloading PDF file in small chunks.
496 uint32 pos = chunk_stream_.GetFirstMissingByte();
497 std::vector<std::pair<size_t, size_t> > ranges;
498 chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
499 DCHECK(ranges.size() > 0);
500 RequestData(ranges[0].first, ranges[0].second);
504 uint32 DocumentLoader::GetRequestSize() const {
505 // Document loading strategy:
506 // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
507 // double the size (64k), and so on, until we cap max request size at 2M for
508 // 71 or more requests.
509 uint32 limited_count = std::min(std::max(requests_count_, 10u), 70u);
510 return 32*1024 * (1 << ((limited_count - 1) / 10u));
513 } // namespace chrome_pdf