pdf/document_loader.cc

   1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "pdf/document_loader.h"
   6
   7 #include "base/logging.h"
   8 #include "base/strings/string_util.h"
   9 #include "net/http/http_util.h"
  10 #include "ppapi/c/pp_errors.h"
  11 #include "ppapi/cpp/url_loader.h"
  12 #include "ppapi/cpp/url_request_info.h"
  13 #include "ppapi/cpp/url_response_info.h"
  14
  15 namespace chrome_pdf {
  16
  17 // Document below size will be downloaded in one chunk.
  18 const uint32 kMinFileSize = 64*1024;
  19
  20 DocumentLoader::DocumentLoader(Client* client)
  21     : client_(client), partial_document_(false), request_pending_(false),
  22       current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
  23       document_size_(0), header_request_(true), is_multipart_(false) {
  24   loader_factory_.Initialize(this);
  25 }
  26
  27 DocumentLoader::~DocumentLoader() {
  28 }
  29
  30 bool DocumentLoader::Init(const pp::URLLoader& loader,
  31                           const std::string& url,
  32                           const std::string& headers) {
  33   DCHECK(url_.empty());
  34   url_ = url;
  35   loader_ = loader;
  36
  37   std::string response_headers;
  38   if (!headers.empty()) {
  39     response_headers = headers;
  40   } else {
  41     pp::URLResponseInfo response = loader_.GetResponseInfo();
  42     pp::Var headers_var = response.GetHeaders();
  43
  44     if (headers_var.is_string()) {
  45       response_headers = headers_var.AsString();
  46     }
  47   }
  48
  49   bool accept_ranges_bytes = false;
  50   bool content_encoded = false;
  51   uint32 content_length = 0;
  52   std::string type;
  53   std::string disposition;
  54   if (!response_headers.empty()) {
  55     net::HttpUtil::HeadersIterator it(response_headers.begin(),
  56                                       response_headers.end(), "\n");
  57     while (it.GetNext()) {
  58       if (LowerCaseEqualsASCII(it.name(), "content-length")) {
  59         content_length = atoi(it.values().c_str());
  60       } else if (LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
  61         accept_ranges_bytes = LowerCaseEqualsASCII(it.values(), "bytes");
  62       } else if (LowerCaseEqualsASCII(it.name(), "content-encoding")) {
  63         content_encoded = true;
  64       } else if (LowerCaseEqualsASCII(it.name(), "content-type")) {
  65         type = it.values();
  66         size_t semi_colon_pos = type.find(';');
  67         if (semi_colon_pos != std::string::npos) {
  68           type = type.substr(0, semi_colon_pos);
  69         }
  70         TrimWhitespace(type, base::TRIM_ALL, &type);
  71       } else if (LowerCaseEqualsASCII(it.name(), "content-disposition")) {
  72         disposition = it.values();
  73       }
  74     }
  75   }
  76   if (!type.empty() &&
  77       !EndsWith(type, "/pdf", false) &&
  78       !EndsWith(type, ".pdf", false) &&
  79       !EndsWith(type, "/x-pdf", false) &&
  80       !EndsWith(type, "/*", false) &&
  81       !EndsWith(type, "/acrobat", false) &&
  82       !EndsWith(type, "/unknown", false)) {
  83     return false;
  84   }
  85   if (StartsWithASCII(disposition, "attachment", false)) {
  86     return false;
  87   }
  88
  89   if (content_length > 0)
  90     chunk_stream_.Preallocate(content_length);
  91
  92   document_size_ = content_length;
  93   requests_count_ = 0;
  94
  95   // Enable partial loading only if file size is above the threshold.
  96   // It will allow avoiding latency for multiple requests.
  97   if (content_length > kMinFileSize &&
  98       accept_ranges_bytes &&
  99       !content_encoded) {
 100     LoadPartialDocument();
 101   } else {
 102     LoadFullDocument();
 103   }
 104   return true;
 105 }
 106
 107 void DocumentLoader::LoadPartialDocument() {
 108   partial_document_ = true;
 109   // Force the main request to be cancelled, since if we're a full-frame plugin
 110   // there could be other references to the loader.
 111   loader_.Close();
 112   loader_ = pp::URLLoader();
 113   // Download file header.
 114   header_request_ = true;
 115   RequestData(0, std::min(GetRequestSize(), document_size_));
 116 }
 117
 118 void DocumentLoader::LoadFullDocument() {
 119   partial_document_ = false;
 120   chunk_buffer_.clear();
 121   ReadMore();
 122 }
 123
 124 bool DocumentLoader::IsDocumentComplete() const {
 125   if (document_size_ == 0)  // Document size unknown.
 126     return false;
 127   return IsDataAvailable(0, document_size_);
 128 }
 129
 130 uint32 DocumentLoader::GetAvailableData() const {
 131   if (document_size_ == 0) {  // If document size is unknown.
 132     return current_pos_;
 133   }
 134
 135   std::vector<std::pair<size_t, size_t> > ranges;
 136   chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
 137   uint32 available = document_size_;
 138   std::vector<std::pair<size_t, size_t> >::iterator it;
 139   for (it = ranges.begin(); it != ranges.end(); ++it) {
 140     available -= it->second;
 141   }
 142   return available;
 143 }
 144
 145 void DocumentLoader::ClearPendingRequests() {
 146   // The first item in the queue is pending (need to keep it in the queue).
 147   if (pending_requests_.size() > 1) {
 148     // Remove all elements except the first one.
 149     pending_requests_.erase(++pending_requests_.begin(),
 150                             pending_requests_.end());
 151   }
 152 }
 153
 154 bool DocumentLoader::GetBlock(uint32 position, uint32 size, void* buf) const {
 155   return chunk_stream_.ReadData(position, size, buf);
 156 }
 157
 158 bool DocumentLoader::IsDataAvailable(uint32 position, uint32 size) const {
 159   return chunk_stream_.IsRangeAvailable(position, size);
 160 }
 161
 162 void DocumentLoader::RequestData(uint32 position, uint32 size) {
 163   DCHECK(partial_document_);
 164
 165   // We have some artefact request from
 166   // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
 167   // document is complete.
 168   // We need this fix in PDFIum. Adding this as a work around.
 169   // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
 170   // Test url:
 171   // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
 172   if (IsDocumentComplete())
 173     return;
 174
 175   pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
 176   DownloadPendingRequests();
 177 }
 178
 179 void DocumentLoader::DownloadPendingRequests() {
 180   if (request_pending_ || pending_requests_.empty())
 181     return;
 182
 183   // Remove already completed requests.
 184   // By design DownloadPendingRequests() should have at least 1 request in the
 185   // queue. ReadComplete() will remove the last pending comment from the queue.
 186   while (pending_requests_.size() > 1) {
 187     if (IsDataAvailable(pending_requests_.front().first,
 188                         pending_requests_.front().second)) {
 189       pending_requests_.pop_front();
 190     } else {
 191       break;
 192     }
 193   }
 194
 195   uint32 pos = pending_requests_.front().first;
 196   uint32 size = pending_requests_.front().second;
 197   if (IsDataAvailable(pos, size)) {
 198     ReadComplete();
 199     return;
 200   }
 201
 202   // If current request has been partially downloaded already, split it into
 203   // a few smaller requests.
 204   std::vector<std::pair<size_t, size_t> > ranges;
 205   chunk_stream_.GetMissedRanges(pos, size, &ranges);
 206   if (ranges.size() > 0) {
 207     pending_requests_.pop_front();
 208     pending_requests_.insert(pending_requests_.begin(),
 209                              ranges.begin(), ranges.end());
 210     pos = pending_requests_.front().first;
 211     size = pending_requests_.front().second;
 212   }
 213
 214   uint32 cur_request_size = GetRequestSize();
 215   // If size is less than default request, try to expand download range for
 216   // more optimal download.
 217   if (size < cur_request_size && partial_document_) {
 218     // First, try to expand block towards the end of the file.
 219     uint32 new_pos = pos;
 220     uint32 new_size = cur_request_size;
 221     if (pos + new_size > document_size_)
 222       new_size = document_size_ - pos;
 223
 224     std::vector<std::pair<size_t, size_t> > ranges;
 225     if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
 226       new_pos = ranges[0].first;
 227       new_size = ranges[0].second;
 228     }
 229
 230     // Second, try to expand block towards the beginning of the file.
 231     if (new_size < cur_request_size) {
 232       uint32 block_end = new_pos + new_size;
 233       if (block_end > cur_request_size) {
 234         new_pos = block_end - cur_request_size;
 235       } else {
 236         new_pos = 0;
 237       }
 238       new_size = block_end - new_pos;
 239
 240       if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
 241         new_pos = ranges.back().first;
 242         new_size = ranges.back().second;
 243       }
 244     }
 245     pos = new_pos;
 246     size = new_size;
 247   }
 248
 249   size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
 250   size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
 251   if (pos - last_byte_before < cur_request_size) {
 252     size = pos + size - last_byte_before;
 253     pos = last_byte_before;
 254   }
 255
 256   if ((pos + size < first_byte_after) &&
 257       (pos + size + cur_request_size >= first_byte_after))
 258     size = first_byte_after - pos;
 259
 260   request_pending_ = true;
 261
 262   // Start downloading first pending request.
 263   loader_.Close();
 264   loader_ = client_->CreateURLLoader();
 265   pp::CompletionCallback callback =
 266       loader_factory_.NewCallback(&DocumentLoader::DidOpen);
 267   pp::URLRequestInfo request = GetRequest(pos, size);
 268   requests_count_++;
 269   int rv = loader_.Open(request, callback);
 270   if (rv != PP_OK_COMPLETIONPENDING)
 271     callback.Run(rv);
 272 }
 273
 274 pp::URLRequestInfo DocumentLoader::GetRequest(uint32 position,
 275                                               uint32 size) const {
 276   pp::URLRequestInfo request(client_->GetPluginInstance());
 277   request.SetURL(url_);
 278   request.SetMethod("GET");
 279   request.SetFollowRedirects(true);
 280   request.SetCustomReferrerURL(url_);
 281
 282   const size_t kBufSize = 100;
 283   char buf[kBufSize];
 284   // According to rfc2616, byte range specifies position of the first and last
 285   // bytes in the requested range inclusively. Therefore we should subtract 1
 286   // from the position + size, to get index of the last byte that needs to be
 287   // downloaded.
 288   base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
 289                  position + size - 1);
 290   pp::Var header(buf);
 291   request.SetHeaders(header);
 292
 293   return request;
 294 }
 295
 296 void DocumentLoader::DidOpen(int32_t result) {
 297   if (result != PP_OK) {
 298     NOTREACHED();
 299     return;
 300   }
 301
 302   int32_t http_code = loader_.GetResponseInfo().GetStatusCode();
 303   if (http_code >= 400 && http_code < 500) {
 304     // Error accessing resource. 4xx error indicate subsequent requests
 305     // will fail too.
 306     // E.g. resource has been removed from the server while loading it.
 307     // https://code.google.com/p/chromium/issues/detail?id=414827
 308     return;
 309   }
 310
 311   is_multipart_ = false;
 312   current_chunk_size_ = 0;
 313   current_chunk_read_ = 0;
 314
 315   pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
 316   std::string headers;
 317   if (headers_var.is_string())
 318     headers = headers_var.AsString();
 319
 320   std::string boundary = GetMultiPartBoundary(headers);
 321   if (boundary.size()) {
 322     // Leave position untouched for now, when we read the data we'll get it.
 323     is_multipart_ = true;
 324     multipart_boundary_ = boundary;
 325   } else {
 326     // Need to make sure that the server returned a byte-range, since it's
 327     // possible for a server to just ignore our bye-range request and just
 328     // return the entire document even if it supports byte-range requests.
 329     // i.e. sniff response to
 330     // http://www.act.org/compass/sample/pdf/geometry.pdf
 331     current_pos_ = 0;
 332     uint32 start_pos, end_pos;
 333     if (GetByteRange(headers, &start_pos, &end_pos)) {
 334       current_pos_ = start_pos;
 335       if (end_pos && end_pos > start_pos)
 336         current_chunk_size_ = end_pos - start_pos + 1;
 337     }
 338   }
 339
 340   ReadMore();
 341 }
 342
 343 bool DocumentLoader::GetByteRange(const std::string& headers, uint32* start,
 344                                   uint32* end) {
 345   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
 346   while (it.GetNext()) {
 347     if (LowerCaseEqualsASCII(it.name(), "content-range")) {
 348       std::string range = it.values().c_str();
 349       if (StartsWithASCII(range, "bytes", false)) {
 350         range = range.substr(strlen("bytes"));
 351         std::string::size_type pos = range.find('-');
 352         std::string range_end;
 353         if (pos != std::string::npos)
 354           range_end = range.substr(pos + 1);
 355         TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
 356         TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
 357         *start = atoi(range.c_str());
 358         *end = atoi(range_end.c_str());
 359         return true;
 360       }
 361     }
 362   }
 363   return false;
 364 }
 365
 366 std::string DocumentLoader::GetMultiPartBoundary(const std::string& headers) {
 367   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
 368   while (it.GetNext()) {
 369     if (LowerCaseEqualsASCII(it.name(), "content-type")) {
 370       std::string type = base::StringToLowerASCII(it.values());
 371       if (StartsWithASCII(type, "multipart/", true)) {
 372         const char* boundary = strstr(type.c_str(), "boundary=");
 373         if (!boundary) {
 374           NOTREACHED();
 375           break;
 376         }
 377
 378         return std::string(boundary + 9);
 379       }
 380     }
 381   }
 382   return std::string();
 383 }
 384
 385 void DocumentLoader::ReadMore() {
 386   pp::CompletionCallback callback =
 387         loader_factory_.NewCallback(&DocumentLoader::DidRead);
 388   int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
 389   if (rv != PP_OK_COMPLETIONPENDING)
 390     callback.Run(rv);
 391 }
 392
 393 void DocumentLoader::DidRead(int32_t result) {
 394   if (result > 0) {
 395     char* start = buffer_;
 396     size_t length = result;
 397     if (is_multipart_ && result > 2) {
 398       for (int i = 2; i < result; ++i) {
 399         if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
 400             (i >= 4 &&
 401              buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
 402              buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
 403           uint32 start_pos, end_pos;
 404           if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
 405             current_pos_ = start_pos;
 406             start += i;
 407             length -= i;
 408             if (end_pos && end_pos > start_pos)
 409               current_chunk_size_ = end_pos - start_pos + 1;
 410           }
 411           break;
 412         }
 413       }
 414
 415       // Reset this flag so we don't look inside the buffer in future calls of
 416       // DidRead for this response.  Note that this code DOES NOT handle multi-
 417       // part responses with more than one part (we don't issue them at the
 418       // moment, so they shouldn't arrive).
 419       is_multipart_ = false;
 420     }
 421
 422     if (current_chunk_size_ &&
 423         current_chunk_read_ + length > current_chunk_size_)
 424       length = current_chunk_size_ - current_chunk_read_;
 425
 426     if (length) {
 427       if (document_size_ > 0) {
 428         chunk_stream_.WriteData(current_pos_, start, length);
 429       } else {
 430         // If we did not get content-length in the response, we can't
 431         // preallocate buffer for the entire document. Resizing array causing
 432         // memory fragmentation issues on the large files and OOM exceptions.
 433         // To fix this, we collect all chunks of the file to the list and
 434         // concatenate them together after request is complete.
 435         chunk_buffer_.push_back(std::vector<unsigned char>());
 436         chunk_buffer_.back().resize(length);
 437         memcpy(&(chunk_buffer_.back()[0]), start, length);
 438       }
 439       current_pos_ += length;
 440       current_chunk_read_ += length;
 441       client_->OnNewDataAvailable();
 442     }
 443     ReadMore();
 444   } else if (result == PP_OK) {
 445     ReadComplete();
 446   } else {
 447     NOTREACHED();
 448   }
 449 }
 450
 451 void DocumentLoader::ReadComplete() {
 452   if (!partial_document_) {
 453     if (document_size_ == 0) {
 454       // For the document with no 'content-length" specified we've collected all
 455       // the chunks already. Let's allocate final document buffer and copy them
 456       // over.
 457       chunk_stream_.Preallocate(current_pos_);
 458       uint32 pos = 0;
 459       std::list<std::vector<unsigned char> >::iterator it;
 460       for (it = chunk_buffer_.begin(); it != chunk_buffer_.end(); ++it) {
 461         chunk_stream_.WriteData(pos, &((*it)[0]), it->size());
 462         pos += it->size();
 463       }
 464       chunk_buffer_.clear();
 465     }
 466     document_size_ = current_pos_;
 467     client_->OnDocumentComplete();
 468     return;
 469   }
 470
 471   request_pending_ = false;
 472   pending_requests_.pop_front();
 473
 474   // If there are more pending request - continue downloading.
 475   if (!pending_requests_.empty()) {
 476     DownloadPendingRequests();
 477     return;
 478   }
 479
 480   if (IsDocumentComplete()) {
 481     client_->OnDocumentComplete();
 482     return;
 483   }
 484
 485   if (header_request_)
 486     client_->OnPartialDocumentLoaded();
 487   else
 488     client_->OnPendingRequestComplete();
 489   header_request_ = false;
 490
 491   // The OnPendingRequestComplete could have added more requests.
 492   if (!pending_requests_.empty()) {
 493     DownloadPendingRequests();
 494   } else {
 495     // Document is not complete and we have no outstanding requests.
 496     // Let's keep downloading PDF file in small chunks.
 497     uint32 pos = chunk_stream_.GetFirstMissingByte();
 498     std::vector<std::pair<size_t, size_t> > ranges;
 499     chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
 500     DCHECK(ranges.size() > 0);
 501     RequestData(ranges[0].first, ranges[0].second);
 502   }
 503 }
 504
 505 uint32 DocumentLoader::GetRequestSize() const {
 506   // Document loading strategy:
 507   // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
 508   // double the size (64k), and so on, until we cap max request size at 2M for
 509   // 71 or more requests.
 510   uint32 limited_count = std::min(std::max(requests_count_, 10u), 70u);
 511   return 32*1024 * (1 << ((limited_count - 1) / 10u));
 512 }
 513
 514 }  // namespace chrome_pdf