1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/dom_distiller/core/distiller.h"
10 #include "base/auto_reset.h"
11 #include "base/bind.h"
12 #include "base/callback.h"
13 #include "base/location.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/values.h"
18 #include "components/dom_distiller/core/distiller_page.h"
19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "net/url_request/url_request_context_getter.h"
// Default upper bound on how many distilled pages one article may contain.
const size_t kMaxPagesInArticle = 32;
29 namespace dom_distiller
{
31 DistillerFactoryImpl::DistillerFactoryImpl(
32 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory
,
33 const dom_distiller::proto::DomDistillerOptions
& dom_distiller_options
)
34 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory
.Pass()),
35 dom_distiller_options_(dom_distiller_options
) {
38 DistillerFactoryImpl::~DistillerFactoryImpl() {}
// Creates a new DistillerImpl, constructed from this factory's URL-fetcher
// factory and distiller options, and passes ownership of it to the caller.
// NOTE(review): the parameter list and the closing brace are not visible in
// this listing -- confirm the full signature against the header.
40 scoped_ptr
<Distiller
> DistillerFactoryImpl::CreateDistillerForUrl(
42 // This default implementation has the same behavior for all URLs.
43 scoped_ptr
<DistillerImpl
> distiller(new DistillerImpl(
44 *distiller_url_fetcher_factory_
, dom_distiller_options_
));
45 return distiller
.Pass();
48 DistillerImpl::DistilledPageData::DistilledPageData() {}
50 DistillerImpl::DistilledPageData::~DistilledPageData() {}
// Constructs a distiller from the given URL-fetcher factory and options.
// The page limit starts at kMaxPagesInArticle and |destruction_allowed_|
// starts out true.
// NOTE(review): the initializer list ends on a trailing comma here, so the
// remaining initializers and the constructor body are not visible in this
// listing -- confirm against the full file.
52 DistillerImpl::DistillerImpl(
53 const DistillerURLFetcherFactory
& distiller_url_fetcher_factory
,
54 const dom_distiller::proto::DomDistillerOptions
& dom_distiller_options
)
55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory
),
56 dom_distiller_options_(dom_distiller_options
),
57 max_pages_in_article_(kMaxPagesInArticle
),
58 destruction_allowed_(true),
62 DistillerImpl::~DistillerImpl() {
63 DCHECK(destruction_allowed_
);
66 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages
) {
67 max_pages_in_article_
= max_num_pages
;
70 bool DistillerImpl::AreAllPagesFinished() const {
71 return started_pages_index_
.empty() && waiting_pages_
.empty();
74 size_t DistillerImpl::TotalPageCount() const {
75 return waiting_pages_
.size() + started_pages_index_
.size() +
76 finished_pages_index_
.size();
79 void DistillerImpl::AddToDistillationQueue(int page_num
, const GURL
& url
) {
80 if (!IsPageNumberInUse(page_num
) && url
.is_valid() &&
81 TotalPageCount() < max_pages_in_article_
&&
82 seen_urls_
.find(url
.spec()) == seen_urls_
.end()) {
83 waiting_pages_
[page_num
] = url
;
87 bool DistillerImpl::IsPageNumberInUse(int page_num
) const {
88 return waiting_pages_
.find(page_num
) != waiting_pages_
.end() ||
89 started_pages_index_
.find(page_num
) != started_pages_index_
.end() ||
90 finished_pages_index_
.find(page_num
) != finished_pages_index_
.end();
// Fetches the DistilledPageData stored at position |index| of |pages_|.
// |index| must be smaller than pages_.size(); this is checked with DCHECK_LT.
// NOTE(review): the function's opening brace and the tail of its body
// (including the return statement) are not visible in this listing --
// presumably |page_data| is what gets returned; confirm against the full file.
93 DistillerImpl::DistilledPageData
* DistillerImpl::GetPageAtIndex(size_t index
)
95 DCHECK_LT(index
, pages_
.size());
96 DistilledPageData
* page_data
= pages_
[index
];
// Entry point for distilling an article that starts at |url|.
//   url:            first page; queued as page number 0.
//   distiller_page: ownership is moved into |distiller_page_|.
//   finished_cb:    stored in |finished_cb_|.
//   update_cb:      stored in |update_cb_|.
// Must only be called while no pages are waiting or started (DCHECK).
// NOTE(review): at least one line after the enqueue call is missing from this
// listing (presumably the call that kicks off distillation, plus the closing
// brace) -- confirm against the full file.
101 void DistillerImpl::DistillPage(const GURL
& url
,
102 scoped_ptr
<DistillerPage
> distiller_page
,
103 const DistillationFinishedCallback
& finished_cb
,
104 const DistillationUpdateCallback
& update_cb
) {
105 DCHECK(AreAllPagesFinished());
106 distiller_page_
= distiller_page
.Pass();
107 finished_cb_
= finished_cb
;
108 update_cb_
= update_cb
;
110 AddToDistillationQueue(0, url
);
// Starts distilling the next waiting page, if there is one.
// The page taken is the first entry of |waiting_pages_| (a std::map keyed by
// page number, so the smallest waiting page number). Its URL is recorded in
// |seen_urls_|, a new DistilledPageData is appended to |pages_|, the page is
// marked as started, and |distiller_page_| is asked to distill it with
// OnPageDistillationFinished bound through a weak pointer as the callback.
// NOTE(review): some call arguments (the first argument to DistillPage and
// the trailing bound arguments) and the closing braces are missing from this
// listing -- confirm against the full file.
114 void DistillerImpl::DistillNextPage() {
115 if (!waiting_pages_
.empty()) {
116 std::map
<int, GURL
>::iterator front
= waiting_pages_
.begin();
117 int page_num
= front
->first
;
// Copy the URL out before the map entry holding it is erased below.
118 const GURL url
= front
->second
;
120 waiting_pages_
.erase(front
);
121 DCHECK(url
.is_valid());
// The page must not already be started or finished.
122 DCHECK(started_pages_index_
.find(page_num
) == started_pages_index_
.end());
123 DCHECK(finished_pages_index_
.find(page_num
) == finished_pages_index_
.end());
124 seen_urls_
.insert(url
.spec());
125 pages_
.push_back(new DistilledPageData());
// Remember where this page's data lives: the element just appended.
126 started_pages_index_
[page_num
] = pages_
.size() - 1;
127 distiller_page_
->DistillPage(
129 dom_distiller_options_
,
130 base::Bind(&DistillerImpl::OnPageDistillationFinished
,
131 weak_factory_
.GetWeakPtr(),
// Callback run when distillation of a single page completes.
//   page_url:                URL of the page that was distilled.
//   distiller_result:        result proto; must be non-null on success.
//   distillation_successful: whether the page was distilled successfully.
// The page must currently be marked as started (DCHECK).
//
// On success the function fills the page's DistilledPageProto from
// |distiller_result| (title, URL, HTML, timing info, debug log, text
// direction, pagination info), queues the next/previous pages it learns about,
// starts a fetch for every content image, and finally calls AddPageIfDone().
//
// NOTE(review): this listing is missing lines, including the |page_num|
// parameter declaration, most closing braces, and the branch structure around
// the text-direction fallback and around the final erase /
// RunDistillerCallbackIfDone() pair (presumably the failure path). Confirm the
// exact control flow against the full file before relying on these comments.
137 void DistillerImpl::OnPageDistillationFinished(
139 const GURL
& page_url
,
140 scoped_ptr
<proto::DomDistillerResult
> distiller_result
,
141 bool distillation_successful
) {
142 DCHECK(started_pages_index_
.find(page_num
) != started_pages_index_
.end());
143 if (distillation_successful
) {
144 DCHECK(distiller_result
.get());
145 DistilledPageData
* page_data
=
146 GetPageAtIndex(started_pages_index_
[page_num
]);
// Allocate a fresh proto holder for this page.
147 page_data
->distilled_page_proto
=
148 new base::RefCountedData
<DistilledPageProto
>();
149 page_data
->page_num
= page_num
;
// Title is copied only when the result carries one.
150 if (distiller_result
->has_title()) {
151 page_data
->distilled_page_proto
->data
.set_title(
152 distiller_result
->title());
154 page_data
->distilled_page_proto
->data
.set_url(page_url
.spec());
// HTML is copied only when the result has distilled content with HTML.
155 if (distiller_result
->has_distilled_content() &&
156 distiller_result
->distilled_content().has_html()) {
157 page_data
->distilled_page_proto
->data
.set_html(
158 distiller_result
->distilled_content().html());
// Timing info: each timing that is present is appended to the page proto as a
// (name, time) entry. A single |timing_info| object is reused for all entries.
161 if (distiller_result
->has_timing_info()) {
162 const proto::TimingInfo
& distiller_timing_info
=
163 distiller_result
->timing_info();
164 DistilledPageProto::TimingInfo timing_info
;
165 if (distiller_timing_info
.has_markup_parsing_time()) {
166 timing_info
.set_name("markup_parsing");
167 timing_info
.set_time(distiller_timing_info
.markup_parsing_time());
168 *page_data
->distilled_page_proto
->data
.add_timing_info() = timing_info
;
171 if (distiller_timing_info
.has_document_construction_time()) {
172 timing_info
.set_name("document_construction");
173 timing_info
.set_time(
174 distiller_timing_info
.document_construction_time());
175 *page_data
->distilled_page_proto
->data
.add_timing_info() = timing_info
;
178 if (distiller_timing_info
.has_article_processing_time()) {
179 timing_info
.set_name("article_processing");
180 timing_info
.set_time(
181 distiller_timing_info
.article_processing_time());
182 *page_data
->distilled_page_proto
->data
.add_timing_info() = timing_info
;
185 if (distiller_timing_info
.has_formatting_time()) {
186 timing_info
.set_name("formatting");
187 timing_info
.set_time(
188 distiller_timing_info
.formatting_time());
189 *page_data
->distilled_page_proto
->data
.add_timing_info() = timing_info
;
192 if (distiller_timing_info
.has_total_time()) {
193 timing_info
.set_name("total");
194 timing_info
.set_time(
195 distiller_timing_info
.total_time());
196 *page_data
->distilled_page_proto
->data
.add_timing_info() = timing_info
;
// Any additional named timings are copied through with their own names.
199 for (int i
= 0; i
< distiller_timing_info
.other_times_size(); i
++) {
200 timing_info
.set_name(distiller_timing_info
.other_times(i
).name());
201 timing_info
.set_time(distiller_timing_info
.other_times(i
).time());
202 *page_data
->distilled_page_proto
->data
.add_timing_info() = timing_info
;
// Debug log is copied only when present.
206 if (distiller_result
->has_debug_info() &&
207 distiller_result
->debug_info().has_log()) {
208 page_data
->distilled_page_proto
->data
.mutable_debug_info()->set_log(
209 distiller_result
->debug_info().log());
// Text direction comes from the result when it has one.
// NOTE(review): the "auto" assignment below is presumably the else-branch
// fallback; the line carrying the else is not visible here.
212 if (distiller_result
->has_text_direction()) {
213 page_data
->distilled_page_proto
->data
.set_text_direction(
214 distiller_result
->text_direction());
216 page_data
->distilled_page_proto
->data
.set_text_direction("auto");
// Pagination: valid next/previous URLs are queued for distillation under the
// neighbouring page numbers and recorded in the page proto; a valid canonical
// URL is only recorded.
219 if (distiller_result
->has_pagination_info()) {
220 const proto::PaginationInfo
& pagination_info
=
221 distiller_result
->pagination_info();
222 if (pagination_info
.has_next_page()) {
223 GURL
next_page_url(pagination_info
.next_page());
224 if (next_page_url
.is_valid()) {
225 // The pages should be in same origin.
226 DCHECK_EQ(next_page_url
.GetOrigin(), page_url
.GetOrigin());
227 AddToDistillationQueue(page_num
+ 1, next_page_url
);
228 page_data
->distilled_page_proto
->data
.mutable_pagination_info()->
229 set_next_page(next_page_url
.spec());
233 if (pagination_info
.has_prev_page()) {
234 GURL
prev_page_url(pagination_info
.prev_page());
235 if (prev_page_url
.is_valid()) {
236 DCHECK_EQ(prev_page_url
.GetOrigin(), page_url
.GetOrigin());
237 AddToDistillationQueue(page_num
- 1, prev_page_url
);
238 page_data
->distilled_page_proto
->data
.mutable_pagination_info()->
239 set_prev_page(prev_page_url
.spec());
243 if (pagination_info
.has_canonical_page()) {
244 GURL
canonical_page_url(pagination_info
.canonical_page());
245 if (canonical_page_url
.is_valid()) {
246 page_data
->distilled_page_proto
->data
.mutable_pagination_info()->
247 set_canonical_page(canonical_page_url
.spec());
// Start one fetch per content image. The image id has the form
// "<page_num + 1>_<img_num>".
252 for (int img_num
= 0; img_num
< distiller_result
->content_images_size();
254 std::string image_id
=
255 base::IntToString(page_num
+ 1) + "_" + base::IntToString(img_num
);
256 FetchImage(page_num
, image_id
,
257 distiller_result
->content_images(img_num
).url());
260 AddPageIfDone(page_num
);
// The page is dropped from the started set and the finished callback is run
// if nothing else is outstanding.
263 started_pages_index_
.erase(page_num
);
264 RunDistillerCallbackIfDone();
// Starts fetching one image that belongs to page |page_num|.
//   page_num:  page the image belongs to; must be marked as started (DCHECK).
//   image_id:  identifier for the image.
//   image_url: URL to fetch; if it is not a valid GURL the function returns
//              without doing anything.
// A fetcher is created through |distiller_url_fetcher_factory_|, stored in the
// page's |image_fetchers_|, and asked to fetch |image_url| with
// OnFetchImageDone bound through a weak pointer as the completion callback.
// NOTE(review): some of the bound callback arguments and the closing brace
// are missing from this listing -- confirm against the full file.
268 void DistillerImpl::FetchImage(int page_num
,
269 const std::string
& image_id
,
270 const std::string
& image_url
) {
271 if (!GURL(image_url
).is_valid()) return;
272 DCHECK(started_pages_index_
.find(page_num
) != started_pages_index_
.end());
273 DistilledPageData
* page_data
= GetPageAtIndex(started_pages_index_
[page_num
]);
274 DistillerURLFetcher
* fetcher
=
275 distiller_url_fetcher_factory_
.CreateDistillerURLFetcher();
// The page's fetcher list keeps the fetcher until its callback has run.
276 page_data
->image_fetchers_
.push_back(fetcher
);
278 fetcher
->FetchURL(image_url
,
279 base::Bind(&DistillerImpl::OnFetchImageDone
,
280 weak_factory_
.GetWeakPtr(),
282 base::Unretained(fetcher
),
// Completion callback for an image fetch started by FetchImage().
//   page_num:     page the image belongs to; must be marked started (DCHECK).
//   url_fetcher:  the fetcher that finished; it is scheduled for deletion.
//   id:           identifier of the image.
//   original_url: URL the image was fetched from; stored on the image entry.
//   response:     fetched bytes; stored as the image data.
// The fetcher is located in the page's |image_fetchers_|, removed from it with
// weak_erase(), and handed to DeleteSoon(). A new image entry is then added to
// the page proto and AddPageIfDone() is called.
// NOTE(review): the value searched for by std::find and at least one
// statement between add_image() and set_data() are missing from this listing
// (presumably |url_fetcher| and the use of |id|) -- confirm against the full
// file.
287 void DistillerImpl::OnFetchImageDone(int page_num
,
288 DistillerURLFetcher
* url_fetcher
,
289 const std::string
& id
,
290 const std::string
& original_url
,
291 const std::string
& response
) {
292 DCHECK(started_pages_index_
.find(page_num
) != started_pages_index_
.end());
293 DistilledPageData
* page_data
= GetPageAtIndex(started_pages_index_
[page_num
]);
294 DCHECK(page_data
->distilled_page_proto
.get());
296 ScopedVector
<DistillerURLFetcher
>::iterator fetcher_it
=
297 std::find(page_data
->image_fetchers_
.begin(),
298 page_data
->image_fetchers_
.end(),
301 DCHECK(fetcher_it
!= page_data
->image_fetchers_
.end());
302 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
303 // callback is invoked by the |url_fetcher|.
304 page_data
->image_fetchers_
.weak_erase(fetcher_it
);
305 base::MessageLoop::current()->DeleteSoon(FROM_HERE
, url_fetcher
);
// Record the fetched image on the page proto.
307 DistilledPageProto_Image
* image
=
308 page_data
->distilled_page_proto
->data
.add_image();
310 image
->set_data(response
);
311 image
->set_url(original_url
);
// With this fetcher gone the page may now be complete.
313 AddPageIfDone(page_num
);
316 void DistillerImpl::AddPageIfDone(int page_num
) {
317 DCHECK(started_pages_index_
.find(page_num
) != started_pages_index_
.end());
318 DCHECK(finished_pages_index_
.find(page_num
) == finished_pages_index_
.end());
319 DistilledPageData
* page_data
= GetPageAtIndex(started_pages_index_
[page_num
]);
320 if (page_data
->image_fetchers_
.empty()) {
321 finished_pages_index_
[page_num
] = started_pages_index_
[page_num
];
322 started_pages_index_
.erase(page_num
);
323 const ArticleDistillationUpdate
& article_update
=
324 CreateDistillationUpdate();
325 DCHECK_EQ(article_update
.GetPagesSize(), finished_pages_index_
.size());
326 update_cb_
.Run(article_update
);
327 RunDistillerCallbackIfDone();
// Builds an ArticleDistillationUpdate describing the pages finished so far.
// - has_prev_page is true when the page number just below the lowest finished
//   page is in use (waiting, started or finished).
// - has_next_page is true when the page number just above the highest finished
//   page is in use.
// - The update carries the distilled_page_proto of each finished page visited
//   by the loop over |finished_pages_index_|.
// Both flags stay false when no page has finished yet.
// NOTE(review): the function's qualifier/opening brace, the declaration of
// |update_pages| and the loop increment are missing from this listing --
// confirm against the full file.
331 const ArticleDistillationUpdate
DistillerImpl::CreateDistillationUpdate()
333 bool has_prev_page
= false;
334 bool has_next_page
= false;
335 if (!finished_pages_index_
.empty()) {
// begin() / rbegin() give the lowest / highest finished page numbers.
336 int prev_page_num
= finished_pages_index_
.begin()->first
- 1;
337 int next_page_num
= finished_pages_index_
.rbegin()->first
+ 1;
338 has_prev_page
= IsPageNumberInUse(prev_page_num
);
339 has_next_page
= IsPageNumberInUse(next_page_num
);
342 std::vector
<scoped_refptr
<ArticleDistillationUpdate::RefCountedPageProto
> >
344 for (std::map
<int, size_t>::const_iterator it
= finished_pages_index_
.begin();
345 it
!= finished_pages_index_
.end();
347 update_pages
.push_back(pages_
[it
->second
]->distilled_page_proto
);
349 return ArticleDistillationUpdate(update_pages
, has_next_page
, has_prev_page
);
// Runs |finished_cb_| once no page is waiting or started.
// Requires |finished_cb_| to be non-null (DCHECK). When everything is
// finished, the finished pages are copied into a new DistilledArticleProto in
// the iteration order of |finished_pages_index_| (a std::map keyed by page
// number), each map entry is erased as it is consumed, and the article is
// passed to |finished_cb_|, which is reset afterwards.
// |destruction_allowed_| is overridden through base::AutoReset while the
// callback runs.
// NOTE(review): several lines are missing from this listing, including the
// condition guarding set_title() (presumably the |first_page| flag), whatever
// empties |pages_| before the DCHECK, and the value passed to AutoReset --
// confirm against the full file.
352 void DistillerImpl::RunDistillerCallbackIfDone() {
353 DCHECK(!finished_cb_
.is_null());
354 if (AreAllPagesFinished()) {
355 bool first_page
= true;
356 scoped_ptr
<DistilledArticleProto
> article_proto(
357 new DistilledArticleProto());
358 // Stitch the pages back into the article.
359 for (std::map
<int, size_t>::iterator it
= finished_pages_index_
.begin();
360 it
!= finished_pages_index_
.end();) {
361 DistilledPageData
* page_data
= GetPageAtIndex(it
->second
);
362 *(article_proto
->add_pages()) = page_data
->distilled_page_proto
->data
;
365 article_proto
->set_title(page_data
->distilled_page_proto
->data
.title());
// Post-increment: |it| moves on before the element it pointed at is erased.
369 finished_pages_index_
.erase(it
++);
373 DCHECK_LE(static_cast<size_t>(article_proto
->pages_size()),
374 max_pages_in_article_
);
376 DCHECK(pages_
.empty());
377 DCHECK(finished_pages_index_
.empty());
379 base::AutoReset
<bool> dont_delete_this_in_callback(&destruction_allowed_
,
381 finished_cb_
.Run(article_proto
.Pass());
382 finished_cb_
.Reset();
386 } // namespace dom_distiller