1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/document_loader.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/http/http_util.h"
10 #include "ppapi/c/pp_errors.h"
11 #include "ppapi/cpp/url_loader.h"
12 #include "ppapi/cpp/url_request_info.h"
13 #include "ppapi/cpp/url_response_info.h"
15 namespace chrome_pdf
{
// Size threshold in bytes (64 KiB). DocumentLoader::Init() only considers
// partial (byte-range) loading when the reported content length exceeds it.
19 // Documents below this size will be downloaded in one chunk.
20 const uint32_t kMinFileSize
= 64 * 1024;
22 // If the headers have a byte-range response, writes the start and end
23 // positions and returns true if at least the start position was parsed.
24 // The end position will be set to 0 if it was not found or parsed from the
// headers.
26 // Returns false if not even a start position could be parsed.
//
// |headers| is iterated line by line ("\n" delimited); only a header named
// "content-range" is examined. |start| and |end| are output parameters.
// NOTE(review): the tail of this function (return statements and closing
// braces) is not visible in this view.
27 bool GetByteRange(const std::string
& headers
, uint32_t* start
, uint32_t* end
) {
28 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
29 while (it
.GetNext()) {
30 if (LowerCaseEqualsASCII(it
.name(), "content-range")) {
31 std::string range
= it
.values().c_str();
// Only the "bytes" range unit is handled. The `false` argument is presumably
// the case-sensitivity flag -- confirm against StartsWithASCII().
32 if (StartsWithASCII(range
, "bytes", false)) {
// Drop the "bytes" unit prefix so |range| begins with the start position.
33 range
= range
.substr(strlen("bytes"));
// Everything after the first '-' is treated as the end position text.
34 std::string::size_type pos
= range
.find('-');
35 std::string range_end
;
36 if (pos
!= std::string::npos
)
37 range_end
= range
.substr(pos
+ 1);
38 TrimWhitespaceASCII(range
, base::TRIM_LEADING
, &range
);
39 TrimWhitespaceASCII(range_end
, base::TRIM_LEADING
, &range_end
);
// atoi() stops at the first non-digit character, so trailing text (the
// "-end" part of |range|, anything after the end number) is ignored. An empty
// |range_end| yields 0. NOTE(review): atoi() reports no errors, so a
// non-numeric value also becomes 0.
40 *start
= atoi(range
.c_str());
41 *end
= atoi(range_end
.c_str());
49 // If the headers have a multi-part response, returns the boundary name.
50 // Otherwise returns an empty string.
//
// Only a "content-type" header whose value starts with "multipart/" is
// examined. The value is lower-cased before it is searched, so the returned
// boundary text is lower-case as well.
// NOTE(review): several lines of this function (including whatever handles a
// missing "boundary=" and the final return) are not visible in this view.
51 std::string
GetMultiPartBoundary(const std::string
& headers
) {
52 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
53 while (it
.GetNext()) {
54 if (LowerCaseEqualsASCII(it
.name(), "content-type")) {
55 std::string type
= base::StringToLowerASCII(it
.values());
56 if (StartsWithASCII(type
, "multipart/", true)) {
// strstr() returns a pointer into |type|'s buffer, or NULL when the
// "boundary=" parameter is absent.
57 const char* boundary
= strstr(type
.c_str(), "boundary=");
// 9 == strlen("boundary="): skip the parameter name and return what follows.
63 return std::string(boundary
+ 9);
// Returns true if |type| ends with one of the suffixes accepted here:
// "/pdf", ".pdf", "/x-pdf", "/*", "/acrobat" or "/unknown".
// The `false` argument is presumably the case-sensitivity flag (i.e. the
// comparison ignores case) -- confirm against EndsWith().
70 bool IsValidContentType(const std::string
& type
) {
71 return (EndsWith(type
, "/pdf", false) ||
72 EndsWith(type
, ".pdf", false) ||
73 EndsWith(type
, "/x-pdf", false) ||
74 EndsWith(type
, "/*", false) ||
75 EndsWith(type
, "/acrobat", false) ||
76 EndsWith(type
, "/unknown", false));
// Out-of-line destructor of DocumentLoader::Client. The body is not visible
// in this view.
81 DocumentLoader::Client::~Client() {
// Constructs a loader that reports to |client|. All position/size counters
// start at 0, no request is pending, and header_request_ starts out true.
// NOTE(review): requests_count_ (read by GetRequestSize()) does not appear in
// this initializer list -- confirm it is initialized elsewhere.
84 DocumentLoader::DocumentLoader(Client
* client
)
85 : client_(client
), partial_document_(false), request_pending_(false),
86 current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
87 document_size_(0), header_request_(true), is_multipart_(false) {
// Bind the callback factory to this object; it is used later to create the
// DidOpen/DidRead completion callbacks.
88 loader_factory_
.Initialize(this);
// Destructor. The body is not visible in this view.
91 DocumentLoader::~DocumentLoader() {
// Inspects the response headers of the initial request and decides how the
// document will be loaded.
//
// |loader|  - URL loader of the initial request.
// |url|     - document URL; used here to tell http(s) sources from others.
// |headers| - response headers supplied by the caller; when empty, headers
//             are taken from loader_'s response info instead.
//
// Parses Content-Length, Accept-Ranges, Content-Encoding, Content-Type and
// Content-Disposition, preallocates chunk_stream_ when a length is known,
// records document_size_, and switches to partial loading when the document
// is larger than kMinFileSize and the server accepts byte ranges.
// NOTE(review): many lines of this function are not visible in this view
// (the declaration of |type|, the else branch around the loader_ headers, the
// early-return statements, the rest of the partial-loading condition and the
// final return); comments below describe only what is visible.
94 bool DocumentLoader::Init(const pp::URLLoader
& loader
,
95 const std::string
& url
,
96 const std::string
& headers
) {
101 std::string response_headers
;
102 if (!headers
.empty()) {
103 response_headers
= headers
;
// Fall back to the headers reported by the loader itself (presumably in an
// else branch whose opening line is not visible here).
105 pp::URLResponseInfo response
= loader_
.GetResponseInfo();
106 pp::Var headers_var
= response
.GetHeaders();
108 if (headers_var
.is_string()) {
109 response_headers
= headers_var
.AsString();
113 bool accept_ranges_bytes
= false;
114 bool content_encoded
= false;
115 uint32_t content_length
= 0;
117 std::string disposition
;
119 // This happens for PDFs not loaded from http(s) sources.
// In that case the content type is forced to "application/pdf".
120 if (response_headers
== "Content-Type: text/plain") {
121 if (!StartsWithASCII(url
, "http://", false) &&
122 !StartsWithASCII(url
, "https://", false)) {
123 type
= "application/pdf";
// Otherwise walk the headers line by line and pick out the ones we need.
126 if (type
.empty() && !response_headers
.empty()) {
127 net::HttpUtil::HeadersIterator
it(response_headers
.begin(),
128 response_headers
.end(), "\n");
129 while (it
.GetNext()) {
130 if (LowerCaseEqualsASCII(it
.name(), "content-length")) {
// NOTE(review): atoi() reports no errors; a malformed value becomes 0, which
// is treated below as "size unknown".
131 content_length
= atoi(it
.values().c_str());
132 } else if (LowerCaseEqualsASCII(it
.name(), "accept-ranges")) {
133 accept_ranges_bytes
= LowerCaseEqualsASCII(it
.values(), "bytes");
134 } else if (LowerCaseEqualsASCII(it
.name(), "content-encoding")) {
// Only the presence of the header is recorded, not its value.
135 content_encoded
= true;
136 } else if (LowerCaseEqualsASCII(it
.name(), "content-type")) {
// Strip any parameters (everything from the first ';') and surrounding
// whitespace, leaving just the media type.
138 size_t semi_colon_pos
= type
.find(';');
139 if (semi_colon_pos
!= std::string::npos
) {
140 type
= type
.substr(0, semi_colon_pos
);
142 TrimWhitespace(type
, base::TRIM_ALL
, &type
);
143 } else if (LowerCaseEqualsASCII(it
.name(), "content-disposition")) {
144 disposition
= it
.values();
// The statements controlled by the next two checks are not visible here;
// presumably loading is rejected for an unsupported content type and for
// "attachment" dispositions -- confirm.
148 if (!type
.empty() && !IsValidContentType(type
))
150 if (StartsWithASCII(disposition
, "attachment", false))
// Reserve storage up front when the server told us the document size.
153 if (content_length
> 0)
154 chunk_stream_
.Preallocate(content_length
);
// A document_size_ of 0 means the size is unknown.
156 document_size_
= content_length
;
159 // Enable partial loading only if file size is above the threshold.
160 // It will allow avoiding latency for multiple requests.
161 if (content_length
> kMinFileSize
&&
162 accept_ranges_bytes
&&
164 LoadPartialDocument();
// Switches the loader into partial (byte-range) mode: drops the current
// loader_ and issues the first range request, starting at offset 0.
171 void DocumentLoader::LoadPartialDocument() {
172 partial_document_
= true;
173 // Force the main request to be cancelled, since if we're a full-frame plugin
174 // there could be other references to the loader.
176 loader_
= pp::URLLoader();
177 // Download file header.
178 header_request_
= true;
// Never request more than the document contains.
179 RequestData(0, std::min(GetRequestSize(), document_size_
));
// Switches the loader into full-document mode and discards any chunks
// buffered so far in chunk_buffer_.
// NOTE(review): the remaining statements of this function are not visible in
// this view.
182 void DocumentLoader::LoadFullDocument() {
183 partial_document_
= false;
184 chunk_buffer_
.clear();
// Returns whether the whole range [0, document_size_) is available.
// A document_size_ of 0 means the size is unknown; the statement executed in
// that case is not visible in this view.
188 bool DocumentLoader::IsDocumentComplete() const {
189 if (document_size_
== 0) // Document size unknown.
191 return IsDataAvailable(0, document_size_
);
// Computes how many bytes of the document are available so far:
// document_size_ minus the total size of the ranges that
// chunk_stream_.GetMissedRanges() reports as missing.
// NOTE(review): the unknown-size branch body and the final return are not
// visible in this view.
194 uint32_t DocumentLoader::GetAvailableData() const {
195 if (document_size_
== 0) { // If document size is unknown.
199 std::vector
<std::pair
<size_t, size_t> > ranges
;
200 chunk_stream_
.GetMissedRanges(0, document_size_
, &ranges
);
201 uint32_t available
= document_size_
;
// Each entry is a (position, size) pair; subtract the size of every gap.
202 for (const auto& range
: ranges
)
203 available
-= range
.second
;
// Drops every queued request except the one at the front of
// pending_requests_.
207 void DocumentLoader::ClearPendingRequests() {
208 // The first item in the queue is pending (need to keep it in the queue).
209 if (pending_requests_
.size() > 1) {
210 // Remove all elements except the first one.
211 pending_requests_
.erase(++pending_requests_
.begin(),
212 pending_requests_
.end());
// Reads a block starting at |position| out of chunk_stream_ and returns the
// result of ChunkStream::ReadData().
// NOTE(review): the rest of the parameter list (|size| and |buf| are used
// below) is not visible in this view.
216 bool DocumentLoader::GetBlock(uint32_t position
,
219 return chunk_stream_
.ReadData(position
, size
, buf
);
// Returns whether chunk_stream_ reports the range of |size| bytes starting at
// |position| as available.
222 bool DocumentLoader::IsDataAvailable(uint32_t position
, uint32_t size
) const {
223 return chunk_stream_
.IsRangeAvailable(position
, size
);
// Queues a request for |size| bytes starting at |position| and kicks off
// downloading of the queue. Must only be called in partial-document mode.
226 void DocumentLoader::RequestData(uint32_t position
, uint32_t size
) {
227 DCHECK(partial_document_
);
229 // We have some artefact request from
230 // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
231 // document is complete.
232 // We need this fix in PDFium. Adding this as a workaround.
233 // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
235 // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
// The statement controlled by this check is not visible here; presumably the
// request is ignored once the document is complete -- confirm.
236 if (IsDocumentComplete())
// Requests are stored as (position, size) pairs.
239 pending_requests_
.push_back(std::pair
<size_t, size_t>(position
, size
));
240 DownloadPendingRequests();
// Starts downloading the request at the front of pending_requests_, unless a
// request is already in flight or the queue is empty.
//
// Visible steps:
//   1. Drop queued requests whose data is already available.
//   2. Split a partially downloaded request into its missing sub-ranges.
//   3. In partial mode, try to grow small requests towards the default
//      request size (GetRequestSize()).
//   4. Merge the request with nearby already-downloaded data boundaries.
//   5. Open a new URL loader for the resulting (pos, size) range.
// NOTE(review): a number of lines (early returns, loop exits, closing braces,
// how new_pos/new_size are applied, error handling after Open()) are not
// visible in this view.
243 void DocumentLoader::DownloadPendingRequests() {
244 if (request_pending_
|| pending_requests_
.empty())
247 // Remove already completed requests.
248 // By design DownloadPendingRequests() should have at least 1 request in the
249 // queue. ReadComplete() will remove the last pending request from the queue.
250 while (pending_requests_
.size() > 1) {
251 if (IsDataAvailable(pending_requests_
.front().first
,
252 pending_requests_
.front().second
)) {
253 pending_requests_
.pop_front();
// |pos| / |size| describe the request that will actually be issued.
259 uint32_t pos
= pending_requests_
.front().first
;
260 uint32_t size
= pending_requests_
.front().second
;
261 if (IsDataAvailable(pos
, size
)) {
266 // If current request has been partially downloaded already, split it into
267 // a few smaller requests.
268 std::vector
<std::pair
<size_t, size_t> > ranges
;
269 chunk_stream_
.GetMissedRanges(pos
, size
, &ranges
);
270 if (!ranges
.empty()) {
// Replace the front request with its missing sub-ranges, keeping them at the
// front of the queue, and continue with the first of them.
271 pending_requests_
.pop_front();
272 pending_requests_
.insert(pending_requests_
.begin(),
273 ranges
.begin(), ranges
.end());
274 pos
= pending_requests_
.front().first
;
275 size
= pending_requests_
.front().second
;
278 uint32_t cur_request_size
= GetRequestSize();
279 // If size is less than default request, try to expand download range for
280 // more optimal download.
281 if (size
< cur_request_size
&& partial_document_
) {
282 // First, try to expand block towards the end of the file.
283 uint32_t new_pos
= pos
;
284 uint32_t new_size
= cur_request_size
;
// Clamp the expanded block to the end of the document.
285 if (pos
+ new_size
> document_size_
)
286 new_size
= document_size_
- pos
;
288 std::vector
<std::pair
<size_t, size_t> > ranges
;
// Use the first missing range inside the expanded block.
289 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
290 new_pos
= ranges
[0].first
;
291 new_size
= ranges
[0].second
;
294 // Second, try to expand block towards the beginning of the file.
295 if (new_size
< cur_request_size
) {
296 uint32_t block_end
= new_pos
+ new_size
;
// Move the start back so the block is cur_request_size long, when the block
// end is far enough from the start of the file for that.
297 if (block_end
> cur_request_size
) {
298 new_pos
= block_end
- cur_request_size
;
302 new_size
= block_end
- new_pos
;
// Use the last missing range, i.e. the one closest to the block end.
304 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
305 new_pos
= ranges
.back().first
;
306 new_size
= ranges
.back().second
;
// Look at the data boundaries on either side of the request so that small
// gaps next to it can be folded into the same download.
313 size_t last_byte_before
= chunk_stream_
.GetLastByteBefore(pos
);
314 size_t first_byte_after
= chunk_stream_
.GetFirstByteAfter(pos
+ size
- 1);
// Gap before the request is smaller than one request: start from
// |last_byte_before| instead.
315 if (pos
- last_byte_before
< cur_request_size
) {
316 size
= pos
+ size
- last_byte_before
;
317 pos
= last_byte_before
;
// Gap after the request is at most one request long: extend the request up to
// |first_byte_after|.
320 if ((pos
+ size
< first_byte_after
) &&
321 (pos
+ size
+ cur_request_size
>= first_byte_after
))
322 size
= first_byte_after
- pos
;
324 request_pending_
= true;
326 // Start downloading first pending request.
328 loader_
= client_
->CreateURLLoader();
// DidOpen() is invoked through the callback once the request has been opened.
329 pp::CompletionCallback callback
=
330 loader_factory_
.NewCallback(&DocumentLoader::DidOpen
);
331 pp::URLRequestInfo request
= GetRequest(pos
, size
);
333 int rv
= loader_
.Open(request
, callback
);
// Any result other than PP_OK_COMPLETIONPENDING is handled by a statement
// that is not visible in this view.
334 if (rv
!= PP_OK_COMPLETIONPENDING
)
// Builds a GET request for url_ that asks for |size| bytes starting at
// |position| via a "Range: bytes=first-last" header. Redirects are followed
// and url_ is also used as the custom referrer.
// NOTE(review): the declarations of |buf| and |header| and the final return
// are not visible in this view.
338 pp::URLRequestInfo
DocumentLoader::GetRequest(uint32_t position
,
339 uint32_t size
) const {
340 pp::URLRequestInfo
request(client_
->GetPluginInstance());
341 request
.SetURL(url_
);
342 request
.SetMethod("GET");
343 request
.SetFollowRedirects(true);
344 request
.SetCustomReferrerURL(url_
);
// Size passed to snprintf() for the formatted Range header.
346 const size_t kBufSize
= 100;
348 // According to rfc2616, byte range specifies position of the first and last
349 // bytes in the requested range inclusively. Therefore we should subtract 1
350 // from the position + size, to get index of the last byte that needs to be
// downloaded.
// NOTE(review): |position| and |size| are uint32_t but are formatted with %d;
// offsets above INT_MAX would be printed as negative numbers. Consider %u.
352 base::snprintf(buf
, kBufSize
, "Range: bytes=%d-%d", position
,
353 position
+ size
- 1);
355 request
.SetHeaders(header
);
// Completion callback for loader_.Open(). |result| is the PPAPI result code
// of the open operation.
//
// On the visible path it resets the per-chunk state, then inspects the
// response headers: a multipart response sets is_multipart_ (the byte range
// is parsed later from the body in DidRead()); otherwise a Content-Range
// header, if present, sets current_pos_ and current_chunk_size_.
// NOTE(review): the error-handling statements, the declaration of |headers|,
// the else branches and the tail of this function are not visible in this
// view.
360 void DocumentLoader::DidOpen(int32_t result
) {
361 if (result
!= PP_OK
) {
366 int32_t http_code
= loader_
.GetResponseInfo().GetStatusCode();
367 if (http_code
>= 400 && http_code
< 500) {
368 // Error accessing resource. 4xx error indicate subsequent requests
370 // E.g. resource has been removed from the server while loading it.
371 // https://code.google.com/p/chromium/issues/detail?id=414827
// Reset per-response state before looking at the new response.
375 is_multipart_
= false;
376 current_chunk_size_
= 0;
377 current_chunk_read_
= 0;
379 pp::Var headers_var
= loader_
.GetResponseInfo().GetHeaders();
381 if (headers_var
.is_string())
382 headers
= headers_var
.AsString();
384 std::string boundary
= GetMultiPartBoundary(headers
);
385 if (!boundary
.empty()) {
386 // Leave position untouched for now, when we read the data we'll get it.
387 is_multipart_
= true;
388 multipart_boundary_
= boundary
;
390 // Need to make sure that the server returned a byte-range, since it's
391 // possible for a server to just ignore our byte-range request and just
392 // return the entire document even if it supports byte-range requests.
393 // i.e. sniff response to
394 // http://www.act.org/compass/sample/pdf/geometry.pdf
396 uint32_t start_pos
, end_pos
;
397 if (GetByteRange(headers
, &start_pos
, &end_pos
)) {
398 current_pos_
= start_pos
;
// The range is inclusive, hence the + 1. An end position of 0 means it was
// not found, in which case current_chunk_size_ keeps its reset value of 0.
399 if (end_pos
&& end_pos
> start_pos
)
400 current_chunk_size_
= end_pos
- start_pos
+ 1;
// Asks loader_ to read the next piece of the response body into buffer_;
// DidRead() is invoked through the callback with the result.
// NOTE(review): the statement that handles a result other than
// PP_OK_COMPLETIONPENDING is not visible in this view.
407 void DocumentLoader::ReadMore() {
408 pp::CompletionCallback callback
=
409 loader_factory_
.NewCallback(&DocumentLoader::DidRead
);
410 int rv
= loader_
.ReadResponseBody(buffer_
, sizeof(buffer_
), callback
);
411 if (rv
!= PP_OK_COMPLETIONPENDING
)
// Completion callback for loader_.ReadResponseBody(). |result| is used as the
// number of bytes placed in buffer_ on the data path; PP_OK (checked at the
// end) is handled by a separate branch.
//
// For a multipart response the part headers at the start of the buffer are
// scanned for the byte range. The data is then stored either directly in
// chunk_stream_ (document size known) or appended to chunk_buffer_ (size
// unknown), and the client is notified.
// NOTE(review): the condition guarding the data path, the code that advances
// |start| / shrinks |length| past the multipart headers, several closing
// braces and the branch bodies at the end are not visible in this view.
415 void DocumentLoader::DidRead(int32_t result
) {
417 char* start
= buffer_
;
418 size_t length
= result
;
419 if (is_multipart_
&& result
> 2) {
// Look for the blank line ("\n\n" or "\r\n\r\n") that terminates the part
// headers; |i| is the index just past it.
420 for (int i
= 2; i
< result
; ++i
) {
// NOTE(review): the second alternative indexes buffer_[i - 3] and
// buffer_[i - 4] while |i| starts at 2; a guard such as `i >= 4` is
// presumably on a line not visible here -- confirm.
421 if ((buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\n') ||
423 buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\r' &&
424 buffer_
[i
- 3] == '\n' && buffer_
[i
- 4] == '\r')) {
425 uint32_t start_pos
, end_pos
;
// Parse the byte range out of the part headers (the first |i| bytes).
426 if (GetByteRange(std::string(buffer_
, i
), &start_pos
, &end_pos
)) {
427 current_pos_
= start_pos
;
// Inclusive range, hence the + 1.
430 if (end_pos
&& end_pos
> start_pos
)
431 current_chunk_size_
= end_pos
- start_pos
+ 1;
437 // Reset this flag so we don't look inside the buffer in future calls of
438 // DidRead for this response. Note that this code DOES NOT handle multi-
439 // part responses with more than one part (we don't issue them at the
440 // moment, so they shouldn't arrive).
441 is_multipart_
= false;
// Never store more than the announced chunk size (when one is known).
444 if (current_chunk_size_
&&
445 current_chunk_read_
+ length
> current_chunk_size_
)
446 length
= current_chunk_size_
- current_chunk_read_
;
// Known document size: write straight into the preallocated stream.
449 if (document_size_
> 0) {
450 chunk_stream_
.WriteData(current_pos_
, start
, length
);
452 // If we did not get content-length in the response, we can't
453 // preallocate buffer for the entire document. Resizing the array causes
454 // memory fragmentation issues on the large files and OOM exceptions.
455 // To fix this, we collect all chunks of the file to the list and
456 // concatenate them together after request is complete.
457 chunk_buffer_
.push_back(std::vector
<unsigned char>());
458 chunk_buffer_
.back().resize(length
);
459 memcpy(&(chunk_buffer_
.back()[0]), start
, length
);
// Advance the write position and the per-chunk counter, then tell the client
// that new data has arrived.
461 current_pos_
+= length
;
462 current_chunk_read_
+= length
;
463 client_
->OnNewDataAvailable();
466 } else if (result
== PP_OK
) {
// Called when the current response has been read completely.
//
// Full-document mode: if the size was unknown, the chunks collected in
// chunk_buffer_ are copied into chunk_stream_, document_size_ is set to the
// number of bytes read, and the client is told the document is complete.
//
// Partial mode: the finished request is popped from pending_requests_, any
// further queued requests are started, the client is notified, and if the
// document is still incomplete with nothing queued, the next missing range is
// requested.
// NOTE(review): the declaration/update of |pos| in the copy loop, the return
// statements between the phases and several braces/conditions are not visible
// in this view.
473 void DocumentLoader::ReadComplete() {
474 if (!partial_document_
) {
475 if (document_size_
== 0) {
476 // For the document with no 'content-length' specified we've collected all
477 // the chunks already. Let's allocate final document buffer and copy them
// current_pos_ is the total number of bytes received so far.
479 chunk_stream_
.Preallocate(current_pos_
);
481 for (auto& chunk
: chunk_buffer_
) {
482 chunk_stream_
.WriteData(pos
, &(chunk
[0]), chunk
.size());
485 chunk_buffer_
.clear();
487 document_size_
= current_pos_
;
488 client_
->OnDocumentComplete();
// Partial mode from here on: the request at the front of the queue is done.
492 request_pending_
= false;
493 pending_requests_
.pop_front();
495 // If there are more pending requests - continue downloading.
496 if (!pending_requests_
.empty()) {
497 DownloadPendingRequests();
501 if (IsDocumentComplete()) {
502 client_
->OnDocumentComplete();
// The conditions under which the next two notifications fire are not fully
// visible here; header_request_ is cleared afterwards.
507 client_
->OnPartialDocumentLoaded();
509 client_
->OnPendingRequestComplete();
510 header_request_
= false;
512 // The OnPendingRequestComplete could have added more requests.
513 if (!pending_requests_
.empty()) {
514 DownloadPendingRequests();
516 // Document is not complete and we have no outstanding requests.
517 // Let's keep downloading PDF file in small chunks.
518 uint32_t pos
= chunk_stream_
.GetFirstMissingByte();
519 std::vector
<std::pair
<size_t, size_t> > ranges
;
520 chunk_stream_
.GetMissedRanges(pos
, GetRequestSize(), &ranges
);
// The document is incomplete, so at least one missing range is expected.
521 DCHECK(!ranges
.empty());
522 RequestData(ranges
[0].first
, ranges
[0].second
);
// Returns the size in bytes to use for the next range request, derived from
// requests_count_.
526 uint32_t DocumentLoader::GetRequestSize() const {
527 // Document loading strategy:
528 // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
529 // double the size (64k), and so on, until we cap max request size at 2M for
530 // 71 or more requests.
// NOTE(review): with the formula below the 2M cap is already reached when
// requests_count_ is 61 ((61 - 1) / 10 == 6 and 32K << 6 == 2M), so the "71"
// above looks off by ten -- confirm how requests_count_ is counted.
// The count is clamped to [10, 70] so the shift amount stays within 0..6.
531 uint32_t limited_count
= std::min(std::max(requests_count_
, 10u), 70u);
532 return 32 * 1024 * (1 << ((limited_count
- 1) / 10u));
535 } // namespace chrome_pdf