pdf/document_loader.cc

   1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "pdf/document_loader.h"
   6
   7 #include "base/logging.h"
   8 #include "base/strings/string_util.h"
   9 #include "net/http/http_util.h"
  10 #include "ppapi/c/pp_errors.h"
  11 #include "ppapi/cpp/url_loader.h"
  12 #include "ppapi/cpp/url_request_info.h"
  13 #include "ppapi/cpp/url_response_info.h"
  14
  15 namespace chrome_pdf {
  16
  17 namespace {
  18
  19 // Document below size will be downloaded in one chunk.
  20 const uint32_t kMinFileSize = 64 * 1024;
  21
  22 // If the headers have a byte-range response, writes the start and end
  23 // positions and returns true if at least the start position was parsed.
  24 // The end position will be set to 0 if it was not found or parsed from the
  25 // response.
  26 // Returns false if not even a start position could be parsed.
  27 bool GetByteRange(const std::string& headers, uint32_t* start, uint32_t* end) {
  28   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
  29   while (it.GetNext()) {
  30     if (base::LowerCaseEqualsASCII(it.name(), "content-range")) {
  31       std::string range = it.values().c_str();
  32       if (base::StartsWith(range, "bytes",
  33                            base::CompareCase::INSENSITIVE_ASCII)) {
  34         range = range.substr(strlen("bytes"));
  35         std::string::size_type pos = range.find('-');
  36         std::string range_end;
  37         if (pos != std::string::npos)
  38           range_end = range.substr(pos + 1);
  39         TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
  40         TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
  41         *start = atoi(range.c_str());
  42         *end = atoi(range_end.c_str());
  43         return true;
  44       }
  45     }
  46   }
  47   return false;
  48 }
  49
  50 // If the headers have a multi-part response, returns the boundary name.
  51 // Otherwise returns an empty string.
  52 std::string GetMultiPartBoundary(const std::string& headers) {
  53   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
  54   while (it.GetNext()) {
  55     if (base::LowerCaseEqualsASCII(it.name(), "content-type")) {
  56       std::string type = base::ToLowerASCII(it.values());
  57       if (base::StartsWith(type, "multipart/", base::CompareCase::SENSITIVE)) {
  58         const char* boundary = strstr(type.c_str(), "boundary=");
  59         if (!boundary) {
  60           NOTREACHED();
  61           break;
  62         }
  63
  64         return std::string(boundary + 9);
  65       }
  66     }
  67   }
  68   return std::string();
  69 }
  70
  71 bool IsValidContentType(const std::string& type) {
  72   return (base::EndsWith(type, "/pdf", base::CompareCase::INSENSITIVE_ASCII) ||
  73           base::EndsWith(type, ".pdf", base::CompareCase::INSENSITIVE_ASCII) ||
  74           base::EndsWith(type, "/x-pdf",
  75                          base::CompareCase::INSENSITIVE_ASCII) ||
  76           base::EndsWith(type, "/*", base::CompareCase::INSENSITIVE_ASCII) ||
  77           base::EndsWith(type, "/acrobat",
  78                          base::CompareCase::INSENSITIVE_ASCII) ||
  79           base::EndsWith(type, "/unknown",
  80                          base::CompareCase::INSENSITIVE_ASCII));
  81 }
  82
  83 }  // namespace
  84
  85 DocumentLoader::Client::~Client() {
  86 }
  87
  88 DocumentLoader::DocumentLoader(Client* client)
  89     : client_(client), partial_document_(false), request_pending_(false),
  90       current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
  91       document_size_(0), header_request_(true), is_multipart_(false) {
  92   loader_factory_.Initialize(this);
  93 }
  94
  95 DocumentLoader::~DocumentLoader() {
  96 }
  97
  98 bool DocumentLoader::Init(const pp::URLLoader& loader,
  99                           const std::string& url,
 100                           const std::string& headers) {
 101   DCHECK(url_.empty());
 102   url_ = url;
 103   loader_ = loader;
 104
 105   std::string response_headers;
 106   if (!headers.empty()) {
 107     response_headers = headers;
 108   } else {
 109     pp::URLResponseInfo response = loader_.GetResponseInfo();
 110     pp::Var headers_var = response.GetHeaders();
 111
 112     if (headers_var.is_string()) {
 113       response_headers = headers_var.AsString();
 114     }
 115   }
 116
 117   bool accept_ranges_bytes = false;
 118   bool content_encoded = false;
 119   uint32_t content_length = 0;
 120   std::string type;
 121   std::string disposition;
 122
 123   // This happens for PDFs not loaded from http(s) sources.
 124   if (response_headers == "Content-Type: text/plain") {
 125     if (!base::StartsWith(url, "http://",
 126                           base::CompareCase::INSENSITIVE_ASCII) &&
 127         !base::StartsWith(url, "https://",
 128                           base::CompareCase::INSENSITIVE_ASCII)) {
 129       type = "application/pdf";
 130     }
 131   }
 132   if (type.empty() && !response_headers.empty()) {
 133     net::HttpUtil::HeadersIterator it(response_headers.begin(),
 134                                       response_headers.end(), "\n");
 135     while (it.GetNext()) {
 136       if (base::LowerCaseEqualsASCII(it.name(), "content-length")) {
 137         content_length = atoi(it.values().c_str());
 138       } else if (base::LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
 139         accept_ranges_bytes = base::LowerCaseEqualsASCII(it.values(), "bytes");
 140       } else if (base::LowerCaseEqualsASCII(it.name(), "content-encoding")) {
 141         content_encoded = true;
 142       } else if (base::LowerCaseEqualsASCII(it.name(), "content-type")) {
 143         type = it.values();
 144         size_t semi_colon_pos = type.find(';');
 145         if (semi_colon_pos != std::string::npos) {
 146           type = type.substr(0, semi_colon_pos);
 147         }
 148         TrimWhitespace(type, base::TRIM_ALL, &type);
 149       } else if (base::LowerCaseEqualsASCII(it.name(), "content-disposition")) {
 150         disposition = it.values();
 151       }
 152     }
 153   }
 154   if (!type.empty() && !IsValidContentType(type))
 155     return false;
 156   if (base::StartsWith(disposition, "attachment",
 157                        base::CompareCase::INSENSITIVE_ASCII))
 158     return false;
 159
 160   if (content_length > 0)
 161     chunk_stream_.Preallocate(content_length);
 162
 163   document_size_ = content_length;
 164   requests_count_ = 0;
 165
 166   // Enable partial loading only if file size is above the threshold.
 167   // It will allow avoiding latency for multiple requests.
 168   if (content_length > kMinFileSize &&
 169       accept_ranges_bytes &&
 170       !content_encoded) {
 171     LoadPartialDocument();
 172   } else {
 173     LoadFullDocument();
 174   }
 175   return true;
 176 }
 177
 178 void DocumentLoader::LoadPartialDocument() {
 179   partial_document_ = true;
 180   // Force the main request to be cancelled, since if we're a full-frame plugin
 181   // there could be other references to the loader.
 182   loader_.Close();
 183   loader_ = pp::URLLoader();
 184   // Download file header.
 185   header_request_ = true;
 186   RequestData(0, std::min(GetRequestSize(), document_size_));
 187 }
 188
 189 void DocumentLoader::LoadFullDocument() {
 190   partial_document_ = false;
 191   chunk_buffer_.clear();
 192   ReadMore();
 193 }
 194
 195 bool DocumentLoader::IsDocumentComplete() const {
 196   if (document_size_ == 0)  // Document size unknown.
 197     return false;
 198   return IsDataAvailable(0, document_size_);
 199 }
 200
 201 uint32_t DocumentLoader::GetAvailableData() const {
 202   if (document_size_ == 0) {  // If document size is unknown.
 203     return current_pos_;
 204   }
 205
 206   std::vector<std::pair<size_t, size_t> > ranges;
 207   chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
 208   uint32_t available = document_size_;
 209   for (const auto& range : ranges)
 210     available -= range.second;
 211   return available;
 212 }
 213
 214 void DocumentLoader::ClearPendingRequests() {
 215   // The first item in the queue is pending (need to keep it in the queue).
 216   if (pending_requests_.size() > 1) {
 217     // Remove all elements except the first one.
 218     pending_requests_.erase(++pending_requests_.begin(),
 219                             pending_requests_.end());
 220   }
 221 }
 222
 223 bool DocumentLoader::GetBlock(uint32_t position,
 224                               uint32_t size,
 225                               void* buf) const {
 226   return chunk_stream_.ReadData(position, size, buf);
 227 }
 228
 229 bool DocumentLoader::IsDataAvailable(uint32_t position, uint32_t size) const {
 230   return chunk_stream_.IsRangeAvailable(position, size);
 231 }
 232
 233 void DocumentLoader::RequestData(uint32_t position, uint32_t size) {
 234   DCHECK(partial_document_);
 235
 236   // We have some artefact request from
 237   // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
 238   // document is complete.
 239   // We need this fix in PDFIum. Adding this as a work around.
 240   // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
 241   // Test url:
 242   // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
 243   if (IsDocumentComplete())
 244     return;
 245
 246   pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
 247   DownloadPendingRequests();
 248 }
 249
 250 void DocumentLoader::DownloadPendingRequests() {
 251   if (request_pending_ || pending_requests_.empty())
 252     return;
 253
 254   // Remove already completed requests.
 255   // By design DownloadPendingRequests() should have at least 1 request in the
 256   // queue. ReadComplete() will remove the last pending comment from the queue.
 257   while (pending_requests_.size() > 1) {
 258     if (IsDataAvailable(pending_requests_.front().first,
 259                         pending_requests_.front().second)) {
 260       pending_requests_.pop_front();
 261     } else {
 262       break;
 263     }
 264   }
 265
 266   uint32_t pos = pending_requests_.front().first;
 267   uint32_t size = pending_requests_.front().second;
 268   if (IsDataAvailable(pos, size)) {
 269     ReadComplete();
 270     return;
 271   }
 272
 273   // If current request has been partially downloaded already, split it into
 274   // a few smaller requests.
 275   std::vector<std::pair<size_t, size_t> > ranges;
 276   chunk_stream_.GetMissedRanges(pos, size, &ranges);
 277   if (!ranges.empty()) {
 278     pending_requests_.pop_front();
 279     pending_requests_.insert(pending_requests_.begin(),
 280                              ranges.begin(), ranges.end());
 281     pos = pending_requests_.front().first;
 282     size = pending_requests_.front().second;
 283   }
 284
 285   uint32_t cur_request_size = GetRequestSize();
 286   // If size is less than default request, try to expand download range for
 287   // more optimal download.
 288   if (size < cur_request_size && partial_document_) {
 289     // First, try to expand block towards the end of the file.
 290     uint32_t new_pos = pos;
 291     uint32_t new_size = cur_request_size;
 292     if (pos + new_size > document_size_)
 293       new_size = document_size_ - pos;
 294
 295     std::vector<std::pair<size_t, size_t> > ranges;
 296     if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
 297       new_pos = ranges[0].first;
 298       new_size = ranges[0].second;
 299     }
 300
 301     // Second, try to expand block towards the beginning of the file.
 302     if (new_size < cur_request_size) {
 303       uint32_t block_end = new_pos + new_size;
 304       if (block_end > cur_request_size) {
 305         new_pos = block_end - cur_request_size;
 306       } else {
 307         new_pos = 0;
 308       }
 309       new_size = block_end - new_pos;
 310
 311       if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
 312         new_pos = ranges.back().first;
 313         new_size = ranges.back().second;
 314       }
 315     }
 316     pos = new_pos;
 317     size = new_size;
 318   }
 319
 320   size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
 321   size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
 322   if (pos - last_byte_before < cur_request_size) {
 323     size = pos + size - last_byte_before;
 324     pos = last_byte_before;
 325   }
 326
 327   if ((pos + size < first_byte_after) &&
 328       (pos + size + cur_request_size >= first_byte_after))
 329     size = first_byte_after - pos;
 330
 331   request_pending_ = true;
 332
 333   // Start downloading first pending request.
 334   loader_.Close();
 335   loader_ = client_->CreateURLLoader();
 336   pp::CompletionCallback callback =
 337       loader_factory_.NewCallback(&DocumentLoader::DidOpen);
 338   pp::URLRequestInfo request = GetRequest(pos, size);
 339   requests_count_++;
 340   int rv = loader_.Open(request, callback);
 341   if (rv != PP_OK_COMPLETIONPENDING)
 342     callback.Run(rv);
 343 }
 344
 345 pp::URLRequestInfo DocumentLoader::GetRequest(uint32_t position,
 346                                               uint32_t size) const {
 347   pp::URLRequestInfo request(client_->GetPluginInstance());
 348   request.SetURL(url_);
 349   request.SetMethod("GET");
 350   request.SetFollowRedirects(true);
 351   request.SetCustomReferrerURL(url_);
 352
 353   const size_t kBufSize = 100;
 354   char buf[kBufSize];
 355   // According to rfc2616, byte range specifies position of the first and last
 356   // bytes in the requested range inclusively. Therefore we should subtract 1
 357   // from the position + size, to get index of the last byte that needs to be
 358   // downloaded.
 359   base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
 360                  position + size - 1);
 361   pp::Var header(buf);
 362   request.SetHeaders(header);
 363
 364   return request;
 365 }
 366
 367 void DocumentLoader::DidOpen(int32_t result) {
 368   if (result != PP_OK) {
 369     NOTREACHED();
 370     return;
 371   }
 372
 373   int32_t http_code = loader_.GetResponseInfo().GetStatusCode();
 374   if (http_code >= 400 && http_code < 500) {
 375     // Error accessing resource. 4xx error indicate subsequent requests
 376     // will fail too.
 377     // E.g. resource has been removed from the server while loading it.
 378     // https://code.google.com/p/chromium/issues/detail?id=414827
 379     return;
 380   }
 381
 382   is_multipart_ = false;
 383   current_chunk_size_ = 0;
 384   current_chunk_read_ = 0;
 385
 386   pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
 387   std::string headers;
 388   if (headers_var.is_string())
 389     headers = headers_var.AsString();
 390
 391   std::string boundary = GetMultiPartBoundary(headers);
 392   if (!boundary.empty()) {
 393     // Leave position untouched for now, when we read the data we'll get it.
 394     is_multipart_ = true;
 395     multipart_boundary_ = boundary;
 396   } else {
 397     // Need to make sure that the server returned a byte-range, since it's
 398     // possible for a server to just ignore our bye-range request and just
 399     // return the entire document even if it supports byte-range requests.
 400     // i.e. sniff response to
 401     // http://www.act.org/compass/sample/pdf/geometry.pdf
 402     current_pos_ = 0;
 403     uint32_t start_pos, end_pos;
 404     if (GetByteRange(headers, &start_pos, &end_pos)) {
 405       current_pos_ = start_pos;
 406       if (end_pos && end_pos > start_pos)
 407         current_chunk_size_ = end_pos - start_pos + 1;
 408     }
 409   }
 410
 411   ReadMore();
 412 }
 413
 414 void DocumentLoader::ReadMore() {
 415   pp::CompletionCallback callback =
 416         loader_factory_.NewCallback(&DocumentLoader::DidRead);
 417   int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
 418   if (rv != PP_OK_COMPLETIONPENDING)
 419     callback.Run(rv);
 420 }
 421
 422 void DocumentLoader::DidRead(int32_t result) {
 423   if (result > 0) {
 424     char* start = buffer_;
 425     size_t length = result;
 426     if (is_multipart_ && result > 2) {
 427       for (int i = 2; i < result; ++i) {
 428         if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
 429             (i >= 4 &&
 430              buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
 431              buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
 432           uint32_t start_pos, end_pos;
 433           if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
 434             current_pos_ = start_pos;
 435             start += i;
 436             length -= i;
 437             if (end_pos && end_pos > start_pos)
 438               current_chunk_size_ = end_pos - start_pos + 1;
 439           }
 440           break;
 441         }
 442       }
 443
 444       // Reset this flag so we don't look inside the buffer in future calls of
 445       // DidRead for this response.  Note that this code DOES NOT handle multi-
 446       // part responses with more than one part (we don't issue them at the
 447       // moment, so they shouldn't arrive).
 448       is_multipart_ = false;
 449     }
 450
 451     if (current_chunk_size_ &&
 452         current_chunk_read_ + length > current_chunk_size_)
 453       length = current_chunk_size_ - current_chunk_read_;
 454
 455     if (length) {
 456       if (document_size_ > 0) {
 457         chunk_stream_.WriteData(current_pos_, start, length);
 458       } else {
 459         // If we did not get content-length in the response, we can't
 460         // preallocate buffer for the entire document. Resizing array causing
 461         // memory fragmentation issues on the large files and OOM exceptions.
 462         // To fix this, we collect all chunks of the file to the list and
 463         // concatenate them together after request is complete.
 464         chunk_buffer_.push_back(std::vector<unsigned char>());
 465         chunk_buffer_.back().resize(length);
 466         memcpy(&(chunk_buffer_.back()[0]), start, length);
 467       }
 468       current_pos_ += length;
 469       current_chunk_read_ += length;
 470       client_->OnNewDataAvailable();
 471     }
 472     ReadMore();
 473   } else if (result == PP_OK) {
 474     ReadComplete();
 475   } else {
 476     NOTREACHED();
 477   }
 478 }
 479
 480 void DocumentLoader::ReadComplete() {
 481   if (!partial_document_) {
 482     if (document_size_ == 0) {
 483       // For the document with no 'content-length" specified we've collected all
 484       // the chunks already. Let's allocate final document buffer and copy them
 485       // over.
 486       chunk_stream_.Preallocate(current_pos_);
 487       uint32_t pos = 0;
 488       for (auto& chunk : chunk_buffer_) {
 489         chunk_stream_.WriteData(pos, &(chunk[0]), chunk.size());
 490         pos += chunk.size();
 491       }
 492       chunk_buffer_.clear();
 493     }
 494     document_size_ = current_pos_;
 495     client_->OnDocumentComplete();
 496     return;
 497   }
 498
 499   request_pending_ = false;
 500   pending_requests_.pop_front();
 501
 502   // If there are more pending request - continue downloading.
 503   if (!pending_requests_.empty()) {
 504     DownloadPendingRequests();
 505     return;
 506   }
 507
 508   if (IsDocumentComplete()) {
 509     client_->OnDocumentComplete();
 510     return;
 511   }
 512
 513   if (header_request_)
 514     client_->OnPartialDocumentLoaded();
 515   else
 516     client_->OnPendingRequestComplete();
 517   header_request_ = false;
 518
 519   // The OnPendingRequestComplete could have added more requests.
 520   if (!pending_requests_.empty()) {
 521     DownloadPendingRequests();
 522   } else {
 523     // Document is not complete and we have no outstanding requests.
 524     // Let's keep downloading PDF file in small chunks.
 525     uint32_t pos = chunk_stream_.GetFirstMissingByte();
 526     std::vector<std::pair<size_t, size_t> > ranges;
 527     chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
 528     DCHECK(!ranges.empty());
 529     RequestData(ranges[0].first, ranges[0].second);
 530   }
 531 }
 532
 533 uint32_t DocumentLoader::GetRequestSize() const {
 534   // Document loading strategy:
 535   // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
 536   // double the size (64k), and so on, until we cap max request size at 2M for
 537   // 71 or more requests.
 538   uint32_t limited_count = std::min(std::max(requests_count_, 10u), 70u);
 539   return 32 * 1024 * (1 << ((limited_count - 1) / 10u));
 540 }
 541
 542 }  // namespace chrome_pdf