Update V8 to version 3.30.4 (based on bleeding_edge revision r24443)
[chromium-blink-merge.git] / components / dom_distiller / core / distiller.cc
blobfe46182e4bcf4610a88d2f0e019b85200b6b826a
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/dom_distiller/core/distiller.h"
7 #include <map>
8 #include <vector>
10 #include "base/auto_reset.h"
11 #include "base/bind.h"
12 #include "base/callback.h"
13 #include "base/location.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/values.h"
18 #include "components/dom_distiller/core/distiller_page.h"
19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "net/url_request/url_request_context_getter.h"
namespace {

// Default upper bound on the number of pages a single distilled article may
// contain. DistillerImpl starts with this limit.
const size_t kMaxPagesInArticle = 32;

}  // namespace
29 namespace dom_distiller {
31 DistillerFactoryImpl::DistillerFactoryImpl(
32 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
33 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
34 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
35 dom_distiller_options_(dom_distiller_options) {
38 DistillerFactoryImpl::~DistillerFactoryImpl() {}
40 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
41 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
42 *distiller_url_fetcher_factory_, dom_distiller_options_));
43 return distiller.PassAs<Distiller>();
46 DistillerImpl::DistilledPageData::DistilledPageData() {}
48 DistillerImpl::DistilledPageData::~DistilledPageData() {}
50 DistillerImpl::DistillerImpl(
51 const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
52 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
53 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
54 dom_distiller_options_(dom_distiller_options),
55 max_pages_in_article_(kMaxPagesInArticle),
56 destruction_allowed_(true),
57 weak_factory_(this) {
60 DistillerImpl::~DistillerImpl() {
61 DCHECK(destruction_allowed_);
64 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
65 max_pages_in_article_ = max_num_pages;
68 bool DistillerImpl::AreAllPagesFinished() const {
69 return started_pages_index_.empty() && waiting_pages_.empty();
72 size_t DistillerImpl::TotalPageCount() const {
73 return waiting_pages_.size() + started_pages_index_.size() +
74 finished_pages_index_.size();
77 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
78 if (!IsPageNumberInUse(page_num) && url.is_valid() &&
79 TotalPageCount() < max_pages_in_article_ &&
80 seen_urls_.find(url.spec()) == seen_urls_.end()) {
81 waiting_pages_[page_num] = url;
85 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
86 return waiting_pages_.find(page_num) != waiting_pages_.end() ||
87 started_pages_index_.find(page_num) != started_pages_index_.end() ||
88 finished_pages_index_.find(page_num) != finished_pages_index_.end();
91 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
92 const {
93 DCHECK_LT(index, pages_.size());
94 DistilledPageData* page_data = pages_[index];
95 DCHECK(page_data);
96 return page_data;
99 void DistillerImpl::DistillPage(const GURL& url,
100 scoped_ptr<DistillerPage> distiller_page,
101 const DistillationFinishedCallback& finished_cb,
102 const DistillationUpdateCallback& update_cb) {
103 DCHECK(AreAllPagesFinished());
104 distiller_page_ = distiller_page.Pass();
105 finished_cb_ = finished_cb;
106 update_cb_ = update_cb;
108 AddToDistillationQueue(0, url);
109 DistillNextPage();
112 void DistillerImpl::DistillNextPage() {
113 if (!waiting_pages_.empty()) {
114 std::map<int, GURL>::iterator front = waiting_pages_.begin();
115 int page_num = front->first;
116 const GURL url = front->second;
118 waiting_pages_.erase(front);
119 DCHECK(url.is_valid());
120 DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
121 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
122 seen_urls_.insert(url.spec());
123 pages_.push_back(new DistilledPageData());
124 started_pages_index_[page_num] = pages_.size() - 1;
125 distiller_page_->DistillPage(
126 url,
127 dom_distiller_options_,
128 base::Bind(&DistillerImpl::OnPageDistillationFinished,
129 weak_factory_.GetWeakPtr(),
130 page_num,
131 url));
135 void DistillerImpl::OnPageDistillationFinished(
136 int page_num,
137 const GURL& page_url,
138 scoped_ptr<proto::DomDistillerResult> distiller_result,
139 bool distillation_successful) {
140 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
141 if (distillation_successful) {
142 DCHECK(distiller_result.get());
143 DistilledPageData* page_data =
144 GetPageAtIndex(started_pages_index_[page_num]);
145 page_data->distilled_page_proto =
146 new base::RefCountedData<DistilledPageProto>();
147 page_data->page_num = page_num;
148 if (distiller_result->has_title()) {
149 page_data->distilled_page_proto->data.set_title(
150 distiller_result->title());
152 page_data->distilled_page_proto->data.set_url(page_url.spec());
153 if (distiller_result->has_distilled_content() &&
154 distiller_result->distilled_content().has_html()) {
155 page_data->distilled_page_proto->data.set_html(
156 distiller_result->distilled_content().html());
158 if (distiller_result->has_debug_info() &&
159 distiller_result->debug_info().has_log()) {
160 page_data->distilled_page_proto->data.mutable_debug_info()->set_log(
161 distiller_result->debug_info().log());
164 if (distiller_result->has_pagination_info()) {
165 proto::PaginationInfo pagination_info =
166 distiller_result->pagination_info();
167 if (pagination_info.has_next_page()) {
168 GURL next_page_url(pagination_info.next_page());
169 if (next_page_url.is_valid()) {
170 // The pages should be in same origin.
171 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
172 AddToDistillationQueue(page_num + 1, next_page_url);
176 if (pagination_info.has_prev_page()) {
177 GURL prev_page_url(pagination_info.prev_page());
178 if (prev_page_url.is_valid()) {
179 DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
180 AddToDistillationQueue(page_num - 1, prev_page_url);
185 for (int img_num = 0; img_num < distiller_result->image_urls_size();
186 ++img_num) {
187 std::string image_id =
188 base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
189 FetchImage(page_num, image_id, distiller_result->image_urls(img_num));
192 AddPageIfDone(page_num);
193 DistillNextPage();
194 } else {
195 started_pages_index_.erase(page_num);
196 RunDistillerCallbackIfDone();
200 void DistillerImpl::FetchImage(int page_num,
201 const std::string& image_id,
202 const std::string& item) {
203 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
204 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
205 DistillerURLFetcher* fetcher =
206 distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
207 page_data->image_fetchers_.push_back(fetcher);
209 fetcher->FetchURL(item,
210 base::Bind(&DistillerImpl::OnFetchImageDone,
211 weak_factory_.GetWeakPtr(),
212 page_num,
213 base::Unretained(fetcher),
214 image_id));
217 void DistillerImpl::OnFetchImageDone(int page_num,
218 DistillerURLFetcher* url_fetcher,
219 const std::string& id,
220 const std::string& response) {
221 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
222 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
223 DCHECK(page_data->distilled_page_proto.get());
224 DCHECK(url_fetcher);
225 ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
226 std::find(page_data->image_fetchers_.begin(),
227 page_data->image_fetchers_.end(),
228 url_fetcher);
230 DCHECK(fetcher_it != page_data->image_fetchers_.end());
231 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
232 // callback is invoked by the |url_fetcher|.
233 page_data->image_fetchers_.weak_erase(fetcher_it);
234 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
236 DistilledPageProto_Image* image =
237 page_data->distilled_page_proto->data.add_image();
238 image->set_name(id);
239 image->set_data(response);
241 AddPageIfDone(page_num);
244 void DistillerImpl::AddPageIfDone(int page_num) {
245 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
246 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
247 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
248 if (page_data->image_fetchers_.empty()) {
249 finished_pages_index_[page_num] = started_pages_index_[page_num];
250 started_pages_index_.erase(page_num);
251 const ArticleDistillationUpdate& article_update =
252 CreateDistillationUpdate();
253 DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
254 update_cb_.Run(article_update);
255 RunDistillerCallbackIfDone();
259 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
260 const {
261 bool has_prev_page = false;
262 bool has_next_page = false;
263 if (!finished_pages_index_.empty()) {
264 int prev_page_num = finished_pages_index_.begin()->first - 1;
265 int next_page_num = finished_pages_index_.rbegin()->first + 1;
266 has_prev_page = IsPageNumberInUse(prev_page_num);
267 has_next_page = IsPageNumberInUse(next_page_num);
270 std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> >
271 update_pages;
272 for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin();
273 it != finished_pages_index_.end();
274 ++it) {
275 update_pages.push_back(pages_[it->second]->distilled_page_proto);
277 return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
280 void DistillerImpl::RunDistillerCallbackIfDone() {
281 DCHECK(!finished_cb_.is_null());
282 if (AreAllPagesFinished()) {
283 bool first_page = true;
284 scoped_ptr<DistilledArticleProto> article_proto(
285 new DistilledArticleProto());
286 // Stitch the pages back into the article.
287 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
288 it != finished_pages_index_.end();) {
289 DistilledPageData* page_data = GetPageAtIndex(it->second);
290 *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
292 if (first_page) {
293 article_proto->set_title(page_data->distilled_page_proto->data.title());
294 first_page = false;
297 finished_pages_index_.erase(it++);
300 pages_.clear();
301 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
302 max_pages_in_article_);
304 DCHECK(pages_.empty());
305 DCHECK(finished_pages_index_.empty());
307 base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
308 false);
309 finished_cb_.Run(article_proto.Pass());
310 finished_cb_.Reset();
314 } // namespace dom_distiller