pdf/document_loader.cc

   1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "pdf/document_loader.h"
   6
   7 #include "base/logging.h"
   8 #include "base/strings/string_util.h"
   9 #include "net/http/http_util.h"
  10 #include "ppapi/c/pp_errors.h"
  11 #include "ppapi/cpp/url_loader.h"
  12 #include "ppapi/cpp/url_request_info.h"
  13 #include "ppapi/cpp/url_response_info.h"
  14
  15 namespace chrome_pdf {
  16
  17 namespace {
  18
  19 // Document below size will be downloaded in one chunk.
  20 const uint32_t kMinFileSize = 64 * 1024;
  21
  22 // If the headers have a byte-range response, writes the start and end
  23 // positions and returns true if at least the start position was parsed.
  24 // The end position will be set to 0 if it was not found or parsed from the
  25 // response.
  26 // Returns false if not even a start position could be parsed.
  27 bool GetByteRange(const std::string& headers, uint32_t* start, uint32_t* end) {
  28   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
  29   while (it.GetNext()) {
  30     if (LowerCaseEqualsASCII(it.name(), "content-range")) {
  31       std::string range = it.values().c_str();
  32       if (StartsWithASCII(range, "bytes", false)) {
  33         range = range.substr(strlen("bytes"));
  34         std::string::size_type pos = range.find('-');
  35         std::string range_end;
  36         if (pos != std::string::npos)
  37           range_end = range.substr(pos + 1);
  38         TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
  39         TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
  40         *start = atoi(range.c_str());
  41         *end = atoi(range_end.c_str());
  42         return true;
  43       }
  44     }
  45   }
  46   return false;
  47 }
  48
  49 // If the headers have a multi-part response, returns the boundary name.
  50 // Otherwise returns an empty string.
  51 std::string GetMultiPartBoundary(const std::string& headers) {
  52   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
  53   while (it.GetNext()) {
  54     if (LowerCaseEqualsASCII(it.name(), "content-type")) {
  55       std::string type = base::StringToLowerASCII(it.values());
  56       if (StartsWithASCII(type, "multipart/", true)) {
  57         const char* boundary = strstr(type.c_str(), "boundary=");
  58         if (!boundary) {
  59           NOTREACHED();
  60           break;
  61         }
  62
  63         return std::string(boundary + 9);
  64       }
  65     }
  66   }
  67   return std::string();
  68 }
  69
  70 bool IsValidContentType(const std::string& type) {
  71   return (EndsWith(type, "/pdf", false) ||
  72           EndsWith(type, ".pdf", false) ||
  73           EndsWith(type, "/x-pdf", false) ||
  74           EndsWith(type, "/*", false) ||
  75           EndsWith(type, "/acrobat", false) ||
  76           EndsWith(type, "/unknown", false));
  77 }
  78
  79 }  // namespace
  80
  81 DocumentLoader::Client::~Client() {
  82 }
  83
  84 DocumentLoader::DocumentLoader(Client* client)
  85     : client_(client), partial_document_(false), request_pending_(false),
  86       current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
  87       document_size_(0), header_request_(true), is_multipart_(false) {
  88   loader_factory_.Initialize(this);
  89 }
  90
  91 DocumentLoader::~DocumentLoader() {
  92 }
  93
  94 bool DocumentLoader::Init(const pp::URLLoader& loader,
  95                           const std::string& url,
  96                           const std::string& headers) {
  97   DCHECK(url_.empty());
  98   url_ = url;
  99   loader_ = loader;
 100
 101   std::string response_headers;
 102   if (!headers.empty()) {
 103     response_headers = headers;
 104   } else {
 105     pp::URLResponseInfo response = loader_.GetResponseInfo();
 106     pp::Var headers_var = response.GetHeaders();
 107
 108     if (headers_var.is_string()) {
 109       response_headers = headers_var.AsString();
 110     }
 111   }
 112
 113   bool accept_ranges_bytes = false;
 114   bool content_encoded = false;
 115   uint32_t content_length = 0;
 116   std::string type;
 117   std::string disposition;
 118
 119   // This happens for PDFs not loaded from http(s) sources.
 120   if (response_headers == "Content-Type: text/plain") {
 121     if (!StartsWithASCII(url, "http://", false) &&
 122         !StartsWithASCII(url, "https://", false)) {
 123       type = "application/pdf";
 124     }
 125   }
 126   if (type.empty() && !response_headers.empty()) {
 127     net::HttpUtil::HeadersIterator it(response_headers.begin(),
 128                                       response_headers.end(), "\n");
 129     while (it.GetNext()) {
 130       if (LowerCaseEqualsASCII(it.name(), "content-length")) {
 131         content_length = atoi(it.values().c_str());
 132       } else if (LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
 133         accept_ranges_bytes = LowerCaseEqualsASCII(it.values(), "bytes");
 134       } else if (LowerCaseEqualsASCII(it.name(), "content-encoding")) {
 135         content_encoded = true;
 136       } else if (LowerCaseEqualsASCII(it.name(), "content-type")) {
 137         type = it.values();
 138         size_t semi_colon_pos = type.find(';');
 139         if (semi_colon_pos != std::string::npos) {
 140           type = type.substr(0, semi_colon_pos);
 141         }
 142         TrimWhitespace(type, base::TRIM_ALL, &type);
 143       } else if (LowerCaseEqualsASCII(it.name(), "content-disposition")) {
 144         disposition = it.values();
 145       }
 146     }
 147   }
 148   if (!type.empty() && !IsValidContentType(type))
 149     return false;
 150   if (StartsWithASCII(disposition, "attachment", false))
 151     return false;
 152
 153   if (content_length > 0)
 154     chunk_stream_.Preallocate(content_length);
 155
 156   document_size_ = content_length;
 157   requests_count_ = 0;
 158
 159   // Enable partial loading only if file size is above the threshold.
 160   // It will allow avoiding latency for multiple requests.
 161   if (content_length > kMinFileSize &&
 162       accept_ranges_bytes &&
 163       !content_encoded) {
 164     LoadPartialDocument();
 165   } else {
 166     LoadFullDocument();
 167   }
 168   return true;
 169 }
 170
 171 void DocumentLoader::LoadPartialDocument() {
 172   partial_document_ = true;
 173   // Force the main request to be cancelled, since if we're a full-frame plugin
 174   // there could be other references to the loader.
 175   loader_.Close();
 176   loader_ = pp::URLLoader();
 177   // Download file header.
 178   header_request_ = true;
 179   RequestData(0, std::min(GetRequestSize(), document_size_));
 180 }
 181
 182 void DocumentLoader::LoadFullDocument() {
 183   partial_document_ = false;
 184   chunk_buffer_.clear();
 185   ReadMore();
 186 }
 187
 188 bool DocumentLoader::IsDocumentComplete() const {
 189   if (document_size_ == 0)  // Document size unknown.
 190     return false;
 191   return IsDataAvailable(0, document_size_);
 192 }
 193
 194 uint32_t DocumentLoader::GetAvailableData() const {
 195   if (document_size_ == 0) {  // If document size is unknown.
 196     return current_pos_;
 197   }
 198
 199   std::vector<std::pair<size_t, size_t> > ranges;
 200   chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
 201   uint32_t available = document_size_;
 202   for (const auto& range : ranges)
 203     available -= range.second;
 204   return available;
 205 }
 206
 207 void DocumentLoader::ClearPendingRequests() {
 208   // The first item in the queue is pending (need to keep it in the queue).
 209   if (pending_requests_.size() > 1) {
 210     // Remove all elements except the first one.
 211     pending_requests_.erase(++pending_requests_.begin(),
 212                             pending_requests_.end());
 213   }
 214 }
 215
 216 bool DocumentLoader::GetBlock(uint32_t position,
 217                               uint32_t size,
 218                               void* buf) const {
 219   return chunk_stream_.ReadData(position, size, buf);
 220 }
 221
 222 bool DocumentLoader::IsDataAvailable(uint32_t position, uint32_t size) const {
 223   return chunk_stream_.IsRangeAvailable(position, size);
 224 }
 225
 226 void DocumentLoader::RequestData(uint32_t position, uint32_t size) {
 227   DCHECK(partial_document_);
 228
 229   // We have some artefact request from
 230   // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
 231   // document is complete.
 232   // We need this fix in PDFIum. Adding this as a work around.
 233   // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
 234   // Test url:
 235   // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
 236   if (IsDocumentComplete())
 237     return;
 238
 239   pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
 240   DownloadPendingRequests();
 241 }
 242
 243 void DocumentLoader::DownloadPendingRequests() {
 244   if (request_pending_ || pending_requests_.empty())
 245     return;
 246
 247   // Remove already completed requests.
 248   // By design DownloadPendingRequests() should have at least 1 request in the
 249   // queue. ReadComplete() will remove the last pending comment from the queue.
 250   while (pending_requests_.size() > 1) {
 251     if (IsDataAvailable(pending_requests_.front().first,
 252                         pending_requests_.front().second)) {
 253       pending_requests_.pop_front();
 254     } else {
 255       break;
 256     }
 257   }
 258
 259   uint32_t pos = pending_requests_.front().first;
 260   uint32_t size = pending_requests_.front().second;
 261   if (IsDataAvailable(pos, size)) {
 262     ReadComplete();
 263     return;
 264   }
 265
 266   // If current request has been partially downloaded already, split it into
 267   // a few smaller requests.
 268   std::vector<std::pair<size_t, size_t> > ranges;
 269   chunk_stream_.GetMissedRanges(pos, size, &ranges);
 270   if (!ranges.empty()) {
 271     pending_requests_.pop_front();
 272     pending_requests_.insert(pending_requests_.begin(),
 273                              ranges.begin(), ranges.end());
 274     pos = pending_requests_.front().first;
 275     size = pending_requests_.front().second;
 276   }
 277
 278   uint32_t cur_request_size = GetRequestSize();
 279   // If size is less than default request, try to expand download range for
 280   // more optimal download.
 281   if (size < cur_request_size && partial_document_) {
 282     // First, try to expand block towards the end of the file.
 283     uint32_t new_pos = pos;
 284     uint32_t new_size = cur_request_size;
 285     if (pos + new_size > document_size_)
 286       new_size = document_size_ - pos;
 287
 288     std::vector<std::pair<size_t, size_t> > ranges;
 289     if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
 290       new_pos = ranges[0].first;
 291       new_size = ranges[0].second;
 292     }
 293
 294     // Second, try to expand block towards the beginning of the file.
 295     if (new_size < cur_request_size) {
 296       uint32_t block_end = new_pos + new_size;
 297       if (block_end > cur_request_size) {
 298         new_pos = block_end - cur_request_size;
 299       } else {
 300         new_pos = 0;
 301       }
 302       new_size = block_end - new_pos;
 303
 304       if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
 305         new_pos = ranges.back().first;
 306         new_size = ranges.back().second;
 307       }
 308     }
 309     pos = new_pos;
 310     size = new_size;
 311   }
 312
 313   size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
 314   size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
 315   if (pos - last_byte_before < cur_request_size) {
 316     size = pos + size - last_byte_before;
 317     pos = last_byte_before;
 318   }
 319
 320   if ((pos + size < first_byte_after) &&
 321       (pos + size + cur_request_size >= first_byte_after))
 322     size = first_byte_after - pos;
 323
 324   request_pending_ = true;
 325
 326   // Start downloading first pending request.
 327   loader_.Close();
 328   loader_ = client_->CreateURLLoader();
 329   pp::CompletionCallback callback =
 330       loader_factory_.NewCallback(&DocumentLoader::DidOpen);
 331   pp::URLRequestInfo request = GetRequest(pos, size);
 332   requests_count_++;
 333   int rv = loader_.Open(request, callback);
 334   if (rv != PP_OK_COMPLETIONPENDING)
 335     callback.Run(rv);
 336 }
 337
 338 pp::URLRequestInfo DocumentLoader::GetRequest(uint32_t position,
 339                                               uint32_t size) const {
 340   pp::URLRequestInfo request(client_->GetPluginInstance());
 341   request.SetURL(url_);
 342   request.SetMethod("GET");
 343   request.SetFollowRedirects(true);
 344   request.SetCustomReferrerURL(url_);
 345
 346   const size_t kBufSize = 100;
 347   char buf[kBufSize];
 348   // According to rfc2616, byte range specifies position of the first and last
 349   // bytes in the requested range inclusively. Therefore we should subtract 1
 350   // from the position + size, to get index of the last byte that needs to be
 351   // downloaded.
 352   base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
 353                  position + size - 1);
 354   pp::Var header(buf);
 355   request.SetHeaders(header);
 356
 357   return request;
 358 }
 359
 360 void DocumentLoader::DidOpen(int32_t result) {
 361   if (result != PP_OK) {
 362     NOTREACHED();
 363     return;
 364   }
 365
 366   int32_t http_code = loader_.GetResponseInfo().GetStatusCode();
 367   if (http_code >= 400 && http_code < 500) {
 368     // Error accessing resource. 4xx error indicate subsequent requests
 369     // will fail too.
 370     // E.g. resource has been removed from the server while loading it.
 371     // https://code.google.com/p/chromium/issues/detail?id=414827
 372     return;
 373   }
 374
 375   is_multipart_ = false;
 376   current_chunk_size_ = 0;
 377   current_chunk_read_ = 0;
 378
 379   pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
 380   std::string headers;
 381   if (headers_var.is_string())
 382     headers = headers_var.AsString();
 383
 384   std::string boundary = GetMultiPartBoundary(headers);
 385   if (!boundary.empty()) {
 386     // Leave position untouched for now, when we read the data we'll get it.
 387     is_multipart_ = true;
 388     multipart_boundary_ = boundary;
 389   } else {
 390     // Need to make sure that the server returned a byte-range, since it's
 391     // possible for a server to just ignore our bye-range request and just
 392     // return the entire document even if it supports byte-range requests.
 393     // i.e. sniff response to
 394     // http://www.act.org/compass/sample/pdf/geometry.pdf
 395     current_pos_ = 0;
 396     uint32_t start_pos, end_pos;
 397     if (GetByteRange(headers, &start_pos, &end_pos)) {
 398       current_pos_ = start_pos;
 399       if (end_pos && end_pos > start_pos)
 400         current_chunk_size_ = end_pos - start_pos + 1;
 401     }
 402   }
 403
 404   ReadMore();
 405 }
 406
 407 void DocumentLoader::ReadMore() {
 408   pp::CompletionCallback callback =
 409         loader_factory_.NewCallback(&DocumentLoader::DidRead);
 410   int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
 411   if (rv != PP_OK_COMPLETIONPENDING)
 412     callback.Run(rv);
 413 }
 414
 415 void DocumentLoader::DidRead(int32_t result) {
 416   if (result > 0) {
 417     char* start = buffer_;
 418     size_t length = result;
 419     if (is_multipart_ && result > 2) {
 420       for (int i = 2; i < result; ++i) {
 421         if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
 422             (i >= 4 &&
 423              buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
 424              buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
 425           uint32_t start_pos, end_pos;
 426           if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
 427             current_pos_ = start_pos;
 428             start += i;
 429             length -= i;
 430             if (end_pos && end_pos > start_pos)
 431               current_chunk_size_ = end_pos - start_pos + 1;
 432           }
 433           break;
 434         }
 435       }
 436
 437       // Reset this flag so we don't look inside the buffer in future calls of
 438       // DidRead for this response.  Note that this code DOES NOT handle multi-
 439       // part responses with more than one part (we don't issue them at the
 440       // moment, so they shouldn't arrive).
 441       is_multipart_ = false;
 442     }
 443
 444     if (current_chunk_size_ &&
 445         current_chunk_read_ + length > current_chunk_size_)
 446       length = current_chunk_size_ - current_chunk_read_;
 447
 448     if (length) {
 449       if (document_size_ > 0) {
 450         chunk_stream_.WriteData(current_pos_, start, length);
 451       } else {
 452         // If we did not get content-length in the response, we can't
 453         // preallocate buffer for the entire document. Resizing array causing
 454         // memory fragmentation issues on the large files and OOM exceptions.
 455         // To fix this, we collect all chunks of the file to the list and
 456         // concatenate them together after request is complete.
 457         chunk_buffer_.push_back(std::vector<unsigned char>());
 458         chunk_buffer_.back().resize(length);
 459         memcpy(&(chunk_buffer_.back()[0]), start, length);
 460       }
 461       current_pos_ += length;
 462       current_chunk_read_ += length;
 463       client_->OnNewDataAvailable();
 464     }
 465     ReadMore();
 466   } else if (result == PP_OK) {
 467     ReadComplete();
 468   } else {
 469     NOTREACHED();
 470   }
 471 }
 472
 473 void DocumentLoader::ReadComplete() {
 474   if (!partial_document_) {
 475     if (document_size_ == 0) {
 476       // For the document with no 'content-length" specified we've collected all
 477       // the chunks already. Let's allocate final document buffer and copy them
 478       // over.
 479       chunk_stream_.Preallocate(current_pos_);
 480       uint32_t pos = 0;
 481       for (auto& chunk : chunk_buffer_) {
 482         chunk_stream_.WriteData(pos, &(chunk[0]), chunk.size());
 483         pos += chunk.size();
 484       }
 485       chunk_buffer_.clear();
 486     }
 487     document_size_ = current_pos_;
 488     client_->OnDocumentComplete();
 489     return;
 490   }
 491
 492   request_pending_ = false;
 493   pending_requests_.pop_front();
 494
 495   // If there are more pending request - continue downloading.
 496   if (!pending_requests_.empty()) {
 497     DownloadPendingRequests();
 498     return;
 499   }
 500
 501   if (IsDocumentComplete()) {
 502     client_->OnDocumentComplete();
 503     return;
 504   }
 505
 506   if (header_request_)
 507     client_->OnPartialDocumentLoaded();
 508   else
 509     client_->OnPendingRequestComplete();
 510   header_request_ = false;
 511
 512   // The OnPendingRequestComplete could have added more requests.
 513   if (!pending_requests_.empty()) {
 514     DownloadPendingRequests();
 515   } else {
 516     // Document is not complete and we have no outstanding requests.
 517     // Let's keep downloading PDF file in small chunks.
 518     uint32_t pos = chunk_stream_.GetFirstMissingByte();
 519     std::vector<std::pair<size_t, size_t> > ranges;
 520     chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
 521     DCHECK(!ranges.empty());
 522     RequestData(ranges[0].first, ranges[0].second);
 523   }
 524 }
 525
 526 uint32_t DocumentLoader::GetRequestSize() const {
 527   // Document loading strategy:
 528   // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
 529   // double the size (64k), and so on, until we cap max request size at 2M for
 530   // 71 or more requests.
 531   uint32_t limited_count = std::min(std::max(requests_count_, 10u), 70u);
 532   return 32 * 1024 * (1 << ((limited_count - 1) / 10u));
 533 }
 534
 535 }  // namespace chrome_pdf