1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/document_loader.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/http/http_util.h"
10 #include "ppapi/c/pp_errors.h"
11 #include "ppapi/cpp/url_loader.h"
12 #include "ppapi/cpp/url_request_info.h"
13 #include "ppapi/cpp/url_response_info.h"
15 namespace chrome_pdf
{
17 // Document below size will be downloaded in one chunk.
18 const uint32 kMinFileSize
= 64*1024;
20 DocumentLoader::DocumentLoader(Client
* client
)
21 : client_(client
), partial_document_(false), request_pending_(false),
22 current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
23 document_size_(0), header_request_(true), is_multipart_(false) {
24 loader_factory_
.Initialize(this);
27 DocumentLoader::~DocumentLoader() {
// Prepares loading of the document at |url|.
//
// |headers|, when non-empty, is used as the response headers; otherwise the
// headers are taken from loader_.GetResponseInfo().
//
// The headers are scanned for content-length, accept-ranges,
// content-encoding, content-type (anything from the first ';' on is dropped
// and whitespace is trimmed) and content-disposition. The content type is
// then checked against a list of accepted suffixes (with an exception for
// "blob:" URLs) and the disposition against the "attachment" prefix. When a
// content length is known the chunk stream is preallocated, the request-size
// growth table is filled in, and partial loading is started for documents
// larger than kMinFileSize whose server accepts byte ranges.
//
// NOTE(review): several statements of this function are not visible in this
// copy of the file (e.g. the bodies of the type/disposition checks, the rest
// of the partial-load condition and the final return). Confirm them against
// the complete source before relying on this summary.
30 bool DocumentLoader::Init(const pp::URLLoader
& loader
,
31 const std::string
& url
,
32 const std::string
& headers
) {
37 std::string response_headers
;
38 if (!headers
.empty()) {
39 response_headers
= headers
;
41 pp::URLResponseInfo response
= loader_
.GetResponseInfo();
42 pp::Var headers_var
= response
.GetHeaders();
44 if (headers_var
.is_string()) {
45 response_headers
= headers_var
.AsString();
49 bool accept_ranges_bytes
= false;
50 bool content_encoded
= false;
51 uint32 content_length
= 0;
53 std::string disposition
;
54 if (!response_headers
.empty()) {
55 net::HttpUtil::HeadersIterator
it(response_headers
.begin(),
56 response_headers
.end(), "\n");
57 while (it
.GetNext()) {
58 if (LowerCaseEqualsASCII(it
.name(), "content-length")) {
// NOTE(review): atoi() yields a signed int that is stored in an unsigned
// value; a negative or malformed Content-Length is not rejected here.
59 content_length
= atoi(it
.values().c_str());
60 } else if (LowerCaseEqualsASCII(it
.name(), "accept-ranges")) {
61 accept_ranges_bytes
= LowerCaseEqualsASCII(it
.values(), "bytes");
62 } else if (LowerCaseEqualsASCII(it
.name(), "content-encoding")) {
63 content_encoded
= true;
64 } else if (LowerCaseEqualsASCII(it
.name(), "content-type")) {
66 size_t semi_colon_pos
= type
.find(';');
67 if (semi_colon_pos
!= std::string::npos
) {
68 type
= type
.substr(0, semi_colon_pos
);
70 TrimWhitespace(type
, base::TRIM_ALL
, &type
);
71 } else if (LowerCaseEqualsASCII(it
.name(), "content-disposition")) {
72 disposition
= it
.values();
77 !EndsWith(type
, "/pdf", false) &&
78 !EndsWith(type
, ".pdf", false) &&
79 !EndsWith(type
, "/x-pdf", false) &&
80 !EndsWith(type
, "/*", false) &&
81 !EndsWith(type
, "/acrobat", false) &&
82 !EndsWith(type
, "/unknown", false) &&
83 !StartsWithASCII(url
, "blob:", false)) {
86 if (StartsWithASCII(disposition
, "attachment", false)) {
90 if (content_length
> 0)
91 chunk_stream_
.Preallocate(content_length
);
93 document_size_
= content_length
;
96 // Document loading strategy.
97 // The following table shows the growth of the minimal request size depending
98 // on the number of requests that have been made already.
99 chunk_size_table_
[10] = 32*1024;
100 chunk_size_table_
[20] = 64*1024;
101 chunk_size_table_
[30] = 128*1024;
102 chunk_size_table_
[40] = 256*1024;
103 chunk_size_table_
[50] = 512*1024;
104 chunk_size_table_
[60] = 1024*1024;
105 chunk_size_table_
[70] = 2048*1024;
107 // Enable partial loading only if file size is above the threshold.
108 // It will allow avoiding latency for multiple requests.
109 if (content_length
> kMinFileSize
&&
110 accept_ranges_bytes
&&
112 LoadPartialDocument();
// Switches to partial loading: marks the document as partial, replaces
// loader_ with a default-constructed pp::URLLoader, flags the next request as
// the header request, and requests the first chunk at offset 0 with a size of
// min(GetRequestSize(), document_size_).
//
// NOTE(review): the statement between the "cancel" comment and the loader_
// reset is not visible in this copy of the file; confirm it against the
// complete source.
119 void DocumentLoader::LoadPartialDocument() {
120 partial_document_
= true;
121 // Force the main request to be cancelled, since if we're a full-frame plugin
122 // there could be other references to the loader.
124 loader_
= pp::URLLoader();
125 // Download file header.
126 header_request_
= true;
127 RequestData(0, std::min(GetRequestSize(), document_size_
));
// Switches to full-document loading: clears the partial flag and discards any
// chunks collected in chunk_buffer_.
//
// NOTE(review): whatever follows the clear() (presumably the call that starts
// reading the response) is not visible in this copy of the file; confirm.
130 void DocumentLoader::LoadFullDocument() {
131 partial_document_
= false;
132 chunk_buffer_
.clear();
// Reports whether the whole range [0, document_size_) is available.
//
// A document_size_ of 0 means the size is unknown.
// NOTE(review): the statement executed in that case is not visible in this
// copy of the file (presumably it returns false); confirm.
136 bool DocumentLoader::IsDocumentComplete() const {
137 if (document_size_
== 0) // Document size unknown.
139 return IsDataAvailable(0, document_size_
);
// Computes how much of the document has been downloaded.
//
// For a known document size the visible code starts from document_size_ and
// subtracts the second member of every range reported by
// chunk_stream_.GetMissedRanges(0, document_size_, ...).
//
// NOTE(review): the unknown-size branch and the final return are not visible
// in this copy of the file; confirm them against the complete source.
142 uint32
DocumentLoader::GetAvailableData() const {
143 if (document_size_
== 0) { // If document size is unknown.
147 std::vector
<std::pair
<size_t, size_t> > ranges
;
148 chunk_stream_
.GetMissedRanges(0, document_size_
, &ranges
);
149 uint32 available
= document_size_
;
150 std::vector
<std::pair
<size_t, size_t> >::iterator it
;
151 for (it
= ranges
.begin(); it
!= ranges
.end(); ++it
) {
152 available
-= it
->second
;
157 void DocumentLoader::ClearPendingRequests() {
158 // The first item in the queue is pending (need to keep it in the queue).
159 if (pending_requests_
.size() > 1) {
160 // Remove all elements except the first one.
161 pending_requests_
.erase(++pending_requests_
.begin(),
162 pending_requests_
.end());
166 bool DocumentLoader::GetBlock(uint32 position
, uint32 size
, void* buf
) const {
167 return chunk_stream_
.ReadData(position
, size
, buf
);
170 bool DocumentLoader::IsDataAvailable(uint32 position
, uint32 size
) const {
171 return chunk_stream_
.IsRangeAvailable(position
, size
);
// Queues a request for |size| bytes starting at |position| and then calls
// DownloadPendingRequests(). Must only be called while partial loading is
// active (enforced by the DCHECK).
//
// NOTE(review): the statement guarded by the IsDocumentComplete() check is
// not visible in this copy of the file (presumably an early return that
// ignores requests once the document is complete); confirm.
174 void DocumentLoader::RequestData(uint32 position
, uint32 size
) {
175 DCHECK(partial_document_
);
177 // We have some artefact request from
178 // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
179 // document is complete.
180 // We need this fix in PDFIum. Adding this as a work around.
181 // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
183 // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
184 if (IsDocumentComplete())
187 pending_requests_
.push_back(std::pair
<size_t, size_t>(position
, size
));
188 DownloadPendingRequests();
// Starts the download for the request at the head of pending_requests_.
//
// Visible steps:
//  1. Check whether a request is already pending or the queue is empty.
//  2. While more than one request is queued, drop head requests whose data is
//     already available.
//  3. Replace the head request with the ranges of it that are still missing.
//  4. If the request is smaller than GetRequestSize() and partial loading is
//     active, try to grow it first towards the end and then towards the
//     beginning of the file, restricted to missing ranges.
//  5. Extend the range back to chunk_stream_.GetLastByteBefore(pos) and
//     forward to chunk_stream_.GetFirstByteAfter(pos + size - 1) when the
//     gap is smaller than the current request size.
//  6. Mark a request as pending, obtain a new URL loader from the client and
//     open the ranged request with DidOpen as the completion callback.
//
// NOTE(review): several statements (early returns, loop exits, the handling
// of an Open() result other than PP_OK_COMPLETIONPENDING, and the exact
// nesting of steps 4 and 5) are not visible in this copy of the file; confirm
// them against the complete source.
191 void DocumentLoader::DownloadPendingRequests() {
192 if (request_pending_
|| pending_requests_
.empty())
195 // Remove already completed requests.
196 // By design DownloadPendingRequests() should have at least 1 request in the
197 // queue. ReadComplete() will remove the last pending request from the queue.
198 while (pending_requests_
.size() > 1) {
199 if (IsDataAvailable(pending_requests_
.front().first
,
200 pending_requests_
.front().second
)) {
201 pending_requests_
.pop_front();
207 uint32 pos
= pending_requests_
.front().first
;
208 uint32 size
= pending_requests_
.front().second
;
209 if (IsDataAvailable(pos
, size
)) {
214 // If current request has been partially downloaded already, split it into
215 // a few smaller requests.
216 std::vector
<std::pair
<size_t, size_t> > ranges
;
217 chunk_stream_
.GetMissedRanges(pos
, size
, &ranges
);
218 if (ranges
.size() > 0) {
219 pending_requests_
.pop_front();
220 pending_requests_
.insert(pending_requests_
.begin(),
221 ranges
.begin(), ranges
.end());
222 pos
= pending_requests_
.front().first
;
223 size
= pending_requests_
.front().second
;
226 uint32 cur_request_size
= GetRequestSize();
227 // If size is less than default request, try to expand download range for
228 // more optimal download.
229 if (size
< cur_request_size
&& partial_document_
) {
230 // First, try to expand block towards the end of the file.
231 uint32 new_pos
= pos
;
232 uint32 new_size
= cur_request_size
;
233 if (pos
+ new_size
> document_size_
)
234 new_size
= document_size_
- pos
;
236 std::vector
<std::pair
<size_t, size_t> > ranges
;
237 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
238 new_pos
= ranges
[0].first
;
239 new_size
= ranges
[0].second
;
242 // Second, try to expand block towards the beginning of the file.
243 if (new_size
< cur_request_size
) {
244 uint32 block_end
= new_pos
+ new_size
;
245 if (block_end
> cur_request_size
) {
246 new_pos
= block_end
- cur_request_size
;
250 new_size
= block_end
- new_pos
;
252 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
253 new_pos
= ranges
.back().first
;
254 new_size
= ranges
.back().second
;
261 size_t last_byte_before
= chunk_stream_
.GetLastByteBefore(pos
);
262 size_t first_byte_after
= chunk_stream_
.GetFirstByteAfter(pos
+ size
- 1);
263 if (pos
- last_byte_before
< cur_request_size
) {
264 size
= pos
+ size
- last_byte_before
;
265 pos
= last_byte_before
;
268 if ((pos
+ size
< first_byte_after
) &&
269 (pos
+ size
+ cur_request_size
>= first_byte_after
))
270 size
= first_byte_after
- pos
;
272 request_pending_
= true;
274 // Start downloading first pending request.
276 loader_
= client_
->CreateURLLoader();
277 pp::CompletionCallback callback
=
278 loader_factory_
.NewCallback(&DocumentLoader::DidOpen
);
279 pp::URLRequestInfo request
= GetRequest(pos
, size
);
281 int rv
= loader_
.Open(request
, callback
);
282 if (rv
!= PP_OK_COMPLETIONPENDING
)
// Builds the URL request used to fetch part of the document: a GET for url_
// that follows redirects and carries a "Range: bytes=first-last" header,
// where first is |position| and last is |position| + size - 1 (byte ranges
// are inclusive on both ends).
//
// NOTE(review): the rest of the parameter list, the declarations of |buf| and
// |header| and the return statement are not visible in this copy of the file;
// confirm them against the complete source.
286 pp::URLRequestInfo
DocumentLoader::GetRequest(uint32 position
,
288 pp::URLRequestInfo
request(client_
->GetPluginInstance());
289 request
.SetURL(url_
.c_str());
290 request
.SetMethod("GET");
291 request
.SetFollowRedirects(true);
293 const size_t kBufSize
= 100;
295 // According to rfc2616, byte range specifies position of the first and last
296 // bytes in the requested range inclusively. Therefore we should subtract 1
297 // from the position + size, to get index of the last byte that needs to be
// NOTE(review): |position| is a uint32 but is formatted with %d; offsets
// above INT_MAX would presumably be printed as negative numbers — confirm.
299 base::snprintf(buf
, kBufSize
, "Range: bytes=%d-%d", position
,
300 position
+ size
- 1);
302 request
.SetHeaders(header
);
// Completion callback for loader_.Open(); |result| is compared against PP_OK.
//
// Resets the per-response state (is_multipart_, current_chunk_size_,
// current_chunk_read_) and inspects the response headers. If they announce a
// multipart body, the boundary is remembered and the position is left to be
// determined when the data is read. Otherwise the byte range from the headers
// sets current_pos_ and, when the end position is non-zero and greater than
// the start, current_chunk_size_.
//
// NOTE(review): the error branch, the declaration of |headers|, the handling
// of a response without a byte range and the statement that starts reading
// are not visible in this copy of the file; confirm them against the complete
// source.
307 void DocumentLoader::DidOpen(int32_t result
) {
308 if (result
!= PP_OK
) {
313 is_multipart_
= false;
314 current_chunk_size_
= 0;
315 current_chunk_read_
= 0;
317 pp::Var headers_var
= loader_
.GetResponseInfo().GetHeaders();
319 if (headers_var
.is_string())
320 headers
= headers_var
.AsString();
322 std::string boundary
= GetMultiPartBoundary(headers
);
323 if (boundary
.size()) {
324 // Leave position untouched for now, when we read the data we'll get it.
325 is_multipart_
= true;
326 multipart_boundary_
= boundary
;
328 // Need to make sure that the server returned a byte-range, since it's
329 // possible for a server to just ignore our byte-range request and just
330 // return the entire document even if it supports byte-range requests.
331 // i.e. sniff response to
332 // http://www.act.org/compass/sample/pdf/geometry.pdf
334 uint32 start_pos
, end_pos
;
335 if (GetByteRange(headers
, &start_pos
, &end_pos
)) {
336 current_pos_
= start_pos
;
337 if (end_pos
&& end_pos
> start_pos
)
338 current_chunk_size_
= end_pos
- start_pos
+ 1;
// Extracts the byte range from a "content-range" header found in |headers|.
//
// For a value starting with "bytes" (case-insensitive) the prefix is removed,
// the text after the first '-' is taken as the end of the range, leading
// whitespace is trimmed from both parts, and both are converted with atoi()
// into |*start| and |*end|.
//
// NOTE(review): the last parameter of the signature and the return
// statements are not visible in this copy of the file; confirm them against
// the complete source. atoi() performs no validation, so malformed values
// presumably yield 0 — confirm callers tolerate that.
345 bool DocumentLoader::GetByteRange(const std::string
& headers
, uint32
* start
,
347 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
348 while (it
.GetNext()) {
349 if (LowerCaseEqualsASCII(it
.name(), "content-range")) {
350 std::string range
= it
.values().c_str();
351 if (StartsWithASCII(range
, "bytes", false)) {
352 range
= range
.substr(strlen("bytes"));
353 std::string::size_type pos
= range
.find('-');
354 std::string range_end
;
355 if (pos
!= std::string::npos
)
356 range_end
= range
.substr(pos
+ 1);
357 TrimWhitespaceASCII(range
, base::TRIM_LEADING
, &range
);
358 TrimWhitespaceASCII(range_end
, base::TRIM_LEADING
, &range_end
);
359 *start
= atoi(range
.c_str());
360 *end
= atoi(range_end
.c_str());
// Returns the multipart boundary announced in |headers|, or an empty string.
//
// The "content-type" header value is lower-cased; if it starts with
// "multipart/", the text following "boundary=" (9 characters long, hence the
// + 9) is returned. Because the value is lower-cased first, the returned
// boundary is lower-case as well.
//
// NOTE(review): the statements between the strstr() call and the return
// (presumably the handling of a missing "boundary=") are not visible in this
// copy of the file; confirm them against the complete source.
368 std::string
DocumentLoader::GetMultiPartBoundary(const std::string
& headers
) {
369 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
370 while (it
.GetNext()) {
371 if (LowerCaseEqualsASCII(it
.name(), "content-type")) {
372 std::string type
= StringToLowerASCII(it
.values());
373 if (StartsWithASCII(type
, "multipart/", true)) {
374 const char* boundary
= strstr(type
.c_str(), "boundary=");
380 return std::string(boundary
+ 9);
384 return std::string();
// Issues a read of up to sizeof(buffer_) bytes of the response body into
// buffer_, with DidRead as the completion callback.
//
// NOTE(review): the statement executed when ReadResponseBody() returns
// something other than PP_OK_COMPLETIONPENDING is not visible in this copy of
// the file (presumably DidRead is invoked directly with the result); confirm.
387 void DocumentLoader::ReadMore() {
388 pp::CompletionCallback callback
=
389 loader_factory_
.NewCallback(&DocumentLoader::DidRead
);
390 int rv
= loader_
.ReadResponseBody(buffer_
, sizeof(buffer_
), callback
);
391 if (rv
!= PP_OK_COMPLETIONPENDING
)
// Completion callback for ReadResponseBody(); |result| is used as the number
// of bytes available in buffer_, with PP_OK handled in a separate branch.
//
// Visible steps for received data:
//  - For a multipart response, scan buffer_ for the blank line that ends the
//    part headers ("\n\n" or "\r\n\r\n"), parse the byte range from those
//    headers to set current_pos_ and current_chunk_size_, then clear
//    is_multipart_ so later reads of this response are not scanned again.
//  - Clamp the length so no more than current_chunk_size_ bytes are consumed
//    for the current chunk.
//  - With a known document size, write the data into chunk_stream_ at
//    current_pos_; otherwise append it to chunk_buffer_ as a new chunk.
//  - Advance current_pos_ and current_chunk_read_ and notify the client via
//    OnNewDataAvailable().
//
// NOTE(review): the outer condition on |result|, part of the header-scan
// condition, the adjustment of |start|/|length| past the part headers and the
// bodies of the trailing branches are not visible in this copy of the file;
// confirm them against the complete source.
395 void DocumentLoader::DidRead(int32_t result
) {
397 char* start
= buffer_
;
398 size_t length
= result
;
399 if (is_multipart_
&& result
> 2) {
400 for (int i
= 2; i
< result
; ++i
) {
401 if ((buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\n') ||
403 buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\r' &&
404 buffer_
[i
- 3] == '\n' && buffer_
[i
- 4] == '\r')) {
405 uint32 start_pos
, end_pos
;
406 if (GetByteRange(std::string(buffer_
, i
), &start_pos
, &end_pos
)) {
407 current_pos_
= start_pos
;
410 if (end_pos
&& end_pos
> start_pos
)
411 current_chunk_size_
= end_pos
- start_pos
+ 1;
417 // Reset this flag so we don't look inside the buffer in future calls of
418 // DidRead for this response. Note that this code DOES NOT handle multi-
419 // part responses with more than one part (we don't issue them at the
420 // moment, so they shouldn't arrive).
421 is_multipart_
= false;
424 if (current_chunk_size_
&&
425 current_chunk_read_
+ length
> current_chunk_size_
)
426 length
= current_chunk_size_
- current_chunk_read_
;
429 if (document_size_
> 0) {
430 chunk_stream_
.WriteData(current_pos_
, start
, length
);
432 // If we did not get content-length in the response, we can't
433 // preallocate buffer for the entire document. Resizing array causing
434 // memory fragmentation issues on the large files and OOM exceptions.
435 // To fix this, we collect all chunks of the file to the list and
436 // concatenate them together after request is complete.
437 chunk_buffer_
.push_back(std::vector
<unsigned char>());
438 chunk_buffer_
.back().resize(length
);
439 memcpy(&(chunk_buffer_
.back()[0]), start
, length
);
441 current_pos_
+= length
;
442 current_chunk_read_
+= length
;
443 client_
->OnNewDataAvailable();
446 } else if (result
== PP_OK
) {
// Finishes the current response.
//
// Full-document mode: if the size was unknown, preallocate chunk_stream_ to
// the number of bytes read (current_pos_), copy every collected chunk from
// chunk_buffer_ into it and clear the buffer; then set document_size_ to
// current_pos_ and notify the client via OnDocumentComplete().
//
// Partial mode: clear request_pending_, pop the finished request and continue
// with the next queued one if any. Otherwise notify the client
// (OnDocumentComplete() when everything is available,
// OnPartialDocumentLoaded(), OnPendingRequestComplete()), clear
// header_request_, and either download requests the client just queued or
// request the first still-missing range, sized by GetRequestSize().
//
// NOTE(review): the declaration and update of |pos| in the copy loop, the
// returns/else branches separating these paths and the condition guarding
// OnPartialDocumentLoaded() are not visible in this copy of the file; confirm
// them against the complete source.
453 void DocumentLoader::ReadComplete() {
454 if (!partial_document_
) {
455 if (document_size_
== 0) {
456 // For the document with no 'content-length' specified we've collected all
457 // the chunks already. Let's allocate final document buffer and copy them
459 chunk_stream_
.Preallocate(current_pos_
);
461 std::list
<std::vector
<unsigned char> >::iterator it
;
462 for (it
= chunk_buffer_
.begin(); it
!= chunk_buffer_
.end(); ++it
) {
463 chunk_stream_
.WriteData(pos
, &((*it
)[0]), it
->size());
466 chunk_buffer_
.clear();
468 document_size_
= current_pos_
;
469 client_
->OnDocumentComplete();
473 request_pending_
= false;
474 pending_requests_
.pop_front();
476 // If there are more pending request - continue downloading.
477 if (!pending_requests_
.empty()) {
478 DownloadPendingRequests();
482 if (IsDocumentComplete()) {
483 client_
->OnDocumentComplete();
488 client_
->OnPartialDocumentLoaded();
490 client_
->OnPendingRequestComplete();
491 header_request_
= false;
493 // The OnPendingRequestComplete could have added more requests.
494 if (!pending_requests_
.empty()) {
495 DownloadPendingRequests();
497 // Document is not complete and we have no outstanding requests.
498 // Let's keep downloading PDF file in small chunks.
499 uint32 pos
= chunk_stream_
.GetFirstMissingByte();
500 std::vector
<std::pair
<size_t, size_t> > ranges
;
501 chunk_stream_
.GetMissedRanges(pos
, GetRequestSize(), &ranges
);
502 DCHECK(ranges
.size() > 0);
503 RequestData(ranges
[0].first
, ranges
[0].second
);
// Looks up the request size for the current number of requests: finds the
// first chunk_size_table_ entry whose key is not less than requests_count_.
//
// NOTE(review): the handling of the "past the last entry" case and the return
// statement are not visible in this copy of the file (presumably the last
// table entry is used and its mapped size returned); confirm.
507 uint32
DocumentLoader::GetRequestSize() const {
508 std::map
<uint32
, uint32
>::const_iterator iter
=
509 chunk_size_table_
.lower_bound(requests_count_
);
510 if (iter
== chunk_size_table_
.end())
515 } // namespace chrome_pdf