Roll src/third_party/WebKit 3aea697:d9c6159 (svn 201973:201974)
[chromium-blink-merge.git] / components / dom_distiller / core / distiller.cc
blob5be079e0ef65d5b098299cbaf5c9c4e646617dba
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/dom_distiller/core/distiller.h"
7 #include <map>
8 #include <vector>
10 #include "base/auto_reset.h"
11 #include "base/bind.h"
12 #include "base/callback.h"
13 #include "base/location.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/values.h"
18 #include "components/dom_distiller/core/distiller_page.h"
19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "net/url_request/url_request_context_getter.h"
namespace {

// Default upper bound on how many pages one distilled article may hold; used
// as the initial value of DistillerImpl::max_pages_in_article_.
const size_t kMaxPagesInArticle = 32;

}  // namespace
29 namespace dom_distiller {
31 DistillerFactoryImpl::DistillerFactoryImpl(
32 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
33 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
34 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
35 dom_distiller_options_(dom_distiller_options) {
38 DistillerFactoryImpl::~DistillerFactoryImpl() {}
40 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistillerForUrl(
41 const GURL& unused) {
42 // This default implementation has the same behavior for all URLs.
43 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
44 *distiller_url_fetcher_factory_, dom_distiller_options_));
45 return distiller.Pass();
48 DistillerImpl::DistilledPageData::DistilledPageData() {}
50 DistillerImpl::DistilledPageData::~DistilledPageData() {}
52 DistillerImpl::DistillerImpl(
53 const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
54 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
56 dom_distiller_options_(dom_distiller_options),
57 max_pages_in_article_(kMaxPagesInArticle),
58 destruction_allowed_(true),
59 weak_factory_(this) {
62 DistillerImpl::~DistillerImpl() {
63 DCHECK(destruction_allowed_);
66 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
67 max_pages_in_article_ = max_num_pages;
70 bool DistillerImpl::AreAllPagesFinished() const {
71 return started_pages_index_.empty() && waiting_pages_.empty();
74 size_t DistillerImpl::TotalPageCount() const {
75 return waiting_pages_.size() + started_pages_index_.size() +
76 finished_pages_index_.size();
79 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
80 if (!IsPageNumberInUse(page_num) && url.is_valid() &&
81 TotalPageCount() < max_pages_in_article_ &&
82 seen_urls_.find(url.spec()) == seen_urls_.end()) {
83 waiting_pages_[page_num] = url;
87 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
88 return waiting_pages_.find(page_num) != waiting_pages_.end() ||
89 started_pages_index_.find(page_num) != started_pages_index_.end() ||
90 finished_pages_index_.find(page_num) != finished_pages_index_.end();
93 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
94 const {
95 DCHECK_LT(index, pages_.size());
96 DistilledPageData* page_data = pages_[index];
97 DCHECK(page_data);
98 return page_data;
101 void DistillerImpl::DistillPage(const GURL& url,
102 scoped_ptr<DistillerPage> distiller_page,
103 const DistillationFinishedCallback& finished_cb,
104 const DistillationUpdateCallback& update_cb) {
105 DCHECK(AreAllPagesFinished());
106 distiller_page_ = distiller_page.Pass();
107 finished_cb_ = finished_cb;
108 update_cb_ = update_cb;
110 AddToDistillationQueue(0, url);
111 DistillNextPage();
114 void DistillerImpl::DistillNextPage() {
115 if (!waiting_pages_.empty()) {
116 std::map<int, GURL>::iterator front = waiting_pages_.begin();
117 int page_num = front->first;
118 const GURL url = front->second;
120 waiting_pages_.erase(front);
121 DCHECK(url.is_valid());
122 DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
123 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
124 seen_urls_.insert(url.spec());
125 pages_.push_back(new DistilledPageData());
126 started_pages_index_[page_num] = pages_.size() - 1;
127 distiller_page_->DistillPage(
128 url,
129 dom_distiller_options_,
130 base::Bind(&DistillerImpl::OnPageDistillationFinished,
131 weak_factory_.GetWeakPtr(),
132 page_num,
133 url));
137 void DistillerImpl::OnPageDistillationFinished(
138 int page_num,
139 const GURL& page_url,
140 scoped_ptr<proto::DomDistillerResult> distiller_result,
141 bool distillation_successful) {
142 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
143 if (distillation_successful) {
144 DCHECK(distiller_result.get());
145 DistilledPageData* page_data =
146 GetPageAtIndex(started_pages_index_[page_num]);
147 page_data->distilled_page_proto =
148 new base::RefCountedData<DistilledPageProto>();
149 page_data->page_num = page_num;
150 if (distiller_result->has_title()) {
151 page_data->distilled_page_proto->data.set_title(
152 distiller_result->title());
154 page_data->distilled_page_proto->data.set_url(page_url.spec());
155 if (distiller_result->has_distilled_content() &&
156 distiller_result->distilled_content().has_html()) {
157 page_data->distilled_page_proto->data.set_html(
158 distiller_result->distilled_content().html());
161 if (distiller_result->has_timing_info()) {
162 const proto::TimingInfo& distiller_timing_info =
163 distiller_result->timing_info();
164 DistilledPageProto::TimingInfo timing_info;
165 if (distiller_timing_info.has_markup_parsing_time()) {
166 timing_info.set_name("markup_parsing");
167 timing_info.set_time(distiller_timing_info.markup_parsing_time());
168 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
171 if (distiller_timing_info.has_document_construction_time()) {
172 timing_info.set_name("document_construction");
173 timing_info.set_time(
174 distiller_timing_info.document_construction_time());
175 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
178 if (distiller_timing_info.has_article_processing_time()) {
179 timing_info.set_name("article_processing");
180 timing_info.set_time(
181 distiller_timing_info.article_processing_time());
182 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
185 if (distiller_timing_info.has_formatting_time()) {
186 timing_info.set_name("formatting");
187 timing_info.set_time(
188 distiller_timing_info.formatting_time());
189 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
192 if (distiller_timing_info.has_total_time()) {
193 timing_info.set_name("total");
194 timing_info.set_time(
195 distiller_timing_info.total_time());
196 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
199 for (int i = 0; i < distiller_timing_info.other_times_size(); i++) {
200 timing_info.set_name(distiller_timing_info.other_times(i).name());
201 timing_info.set_time(distiller_timing_info.other_times(i).time());
202 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
206 if (distiller_result->has_debug_info() &&
207 distiller_result->debug_info().has_log()) {
208 page_data->distilled_page_proto->data.mutable_debug_info()->set_log(
209 distiller_result->debug_info().log());
212 if (distiller_result->has_text_direction()) {
213 page_data->distilled_page_proto->data.set_text_direction(
214 distiller_result->text_direction());
215 } else {
216 page_data->distilled_page_proto->data.set_text_direction("auto");
219 if (distiller_result->has_pagination_info()) {
220 const proto::PaginationInfo& pagination_info =
221 distiller_result->pagination_info();
222 if (pagination_info.has_next_page()) {
223 GURL next_page_url(pagination_info.next_page());
224 if (next_page_url.is_valid()) {
225 // The pages should be in same origin.
226 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
227 AddToDistillationQueue(page_num + 1, next_page_url);
228 page_data->distilled_page_proto->data.mutable_pagination_info()->
229 set_next_page(next_page_url.spec());
233 if (pagination_info.has_prev_page()) {
234 GURL prev_page_url(pagination_info.prev_page());
235 if (prev_page_url.is_valid()) {
236 DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
237 AddToDistillationQueue(page_num - 1, prev_page_url);
238 page_data->distilled_page_proto->data.mutable_pagination_info()->
239 set_prev_page(prev_page_url.spec());
243 if (pagination_info.has_canonical_page()) {
244 GURL canonical_page_url(pagination_info.canonical_page());
245 if (canonical_page_url.is_valid()) {
246 page_data->distilled_page_proto->data.mutable_pagination_info()->
247 set_canonical_page(canonical_page_url.spec());
252 for (int img_num = 0; img_num < distiller_result->content_images_size();
253 ++img_num) {
254 std::string image_id =
255 base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
256 FetchImage(page_num, image_id,
257 distiller_result->content_images(img_num).url());
260 AddPageIfDone(page_num);
261 DistillNextPage();
262 } else {
263 started_pages_index_.erase(page_num);
264 RunDistillerCallbackIfDone();
268 void DistillerImpl::FetchImage(int page_num,
269 const std::string& image_id,
270 const std::string& image_url) {
271 if (!GURL(image_url).is_valid()) return;
272 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
273 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
274 DistillerURLFetcher* fetcher =
275 distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
276 page_data->image_fetchers_.push_back(fetcher);
278 fetcher->FetchURL(image_url,
279 base::Bind(&DistillerImpl::OnFetchImageDone,
280 weak_factory_.GetWeakPtr(),
281 page_num,
282 base::Unretained(fetcher),
283 image_id,
284 image_url));
287 void DistillerImpl::OnFetchImageDone(int page_num,
288 DistillerURLFetcher* url_fetcher,
289 const std::string& id,
290 const std::string& original_url,
291 const std::string& response) {
292 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
293 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
294 DCHECK(page_data->distilled_page_proto.get());
295 DCHECK(url_fetcher);
296 ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
297 std::find(page_data->image_fetchers_.begin(),
298 page_data->image_fetchers_.end(),
299 url_fetcher);
301 DCHECK(fetcher_it != page_data->image_fetchers_.end());
302 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
303 // callback is invoked by the |url_fetcher|.
304 page_data->image_fetchers_.weak_erase(fetcher_it);
305 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
307 DistilledPageProto_Image* image =
308 page_data->distilled_page_proto->data.add_image();
309 image->set_name(id);
310 image->set_data(response);
311 image->set_url(original_url);
313 AddPageIfDone(page_num);
316 void DistillerImpl::AddPageIfDone(int page_num) {
317 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
318 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
319 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
320 if (page_data->image_fetchers_.empty()) {
321 finished_pages_index_[page_num] = started_pages_index_[page_num];
322 started_pages_index_.erase(page_num);
323 const ArticleDistillationUpdate& article_update =
324 CreateDistillationUpdate();
325 DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
326 update_cb_.Run(article_update);
327 RunDistillerCallbackIfDone();
331 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
332 const {
333 bool has_prev_page = false;
334 bool has_next_page = false;
335 if (!finished_pages_index_.empty()) {
336 int prev_page_num = finished_pages_index_.begin()->first - 1;
337 int next_page_num = finished_pages_index_.rbegin()->first + 1;
338 has_prev_page = IsPageNumberInUse(prev_page_num);
339 has_next_page = IsPageNumberInUse(next_page_num);
342 std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> >
343 update_pages;
344 for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin();
345 it != finished_pages_index_.end();
346 ++it) {
347 update_pages.push_back(pages_[it->second]->distilled_page_proto);
349 return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
352 void DistillerImpl::RunDistillerCallbackIfDone() {
353 DCHECK(!finished_cb_.is_null());
354 if (AreAllPagesFinished()) {
355 bool first_page = true;
356 scoped_ptr<DistilledArticleProto> article_proto(
357 new DistilledArticleProto());
358 // Stitch the pages back into the article.
359 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
360 it != finished_pages_index_.end();) {
361 DistilledPageData* page_data = GetPageAtIndex(it->second);
362 *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
364 if (first_page) {
365 article_proto->set_title(page_data->distilled_page_proto->data.title());
366 first_page = false;
369 finished_pages_index_.erase(it++);
372 pages_.clear();
373 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
374 max_pages_in_article_);
376 DCHECK(pages_.empty());
377 DCHECK(finished_pages_index_.empty());
379 base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
380 false);
381 finished_cb_.Run(article_proto.Pass());
382 finished_cb_.Reset();
386 } // namespace dom_distiller