1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/dom_distiller/core/distiller.h"
10 #include "base/auto_reset.h"
11 #include "base/bind.h"
12 #include "base/callback.h"
13 #include "base/location.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/values.h"
18 #include "components/dom_distiller/core/distiller_page.h"
19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "net/url_request/url_request_context_getter.h"
// Default cap on the number of pages a single article may contain.
// DistillerImpl copies this value into max_pages_in_article_ when it is
// constructed, and AddToDistillationQueue() stops enqueueing pages once
// TotalPageCount() reaches that limit.
25 // Maximum number of distilled pages in an article.
26 const size_t kMaxPagesInArticle
= 32;
29 namespace dom_distiller
{
31 DistillerFactoryImpl::DistillerFactoryImpl(
32 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory
,
33 const dom_distiller::proto::DomDistillerOptions
& dom_distiller_options
)
34 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory
.Pass()),
35 dom_distiller_options_(dom_distiller_options
) {
38 DistillerFactoryImpl::~DistillerFactoryImpl() {}
40 scoped_ptr
<Distiller
> DistillerFactoryImpl::CreateDistiller() {
41 scoped_ptr
<DistillerImpl
> distiller(new DistillerImpl(
42 *distiller_url_fetcher_factory_
, dom_distiller_options_
));
43 return distiller
.PassAs
<Distiller
>();
46 DistillerImpl::DistilledPageData::DistilledPageData() {}
48 DistillerImpl::DistilledPageData::~DistilledPageData() {}
// Constructs a distiller.
//   |distiller_url_fetcher_factory| - stored in
//       distiller_url_fetcher_factory_; FetchImage() uses it to create image
//       fetchers.
//   |dom_distiller_options| - stored in dom_distiller_options_ and passed
//       along with each page distillation request.
// The page limit starts at kMaxPagesInArticle and destruction is initially
// allowed.
// NOTE(review): the initializer list ends with a trailing comma in this view,
// so at least one further initializer and the constructor body are not
// visible here.
50 DistillerImpl::DistillerImpl(
51 const DistillerURLFetcherFactory
& distiller_url_fetcher_factory
,
52 const dom_distiller::proto::DomDistillerOptions
& dom_distiller_options
)
53 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory
),
54 dom_distiller_options_(dom_distiller_options
),
55 max_pages_in_article_(kMaxPagesInArticle
),
56 destruction_allowed_(true),
60 DistillerImpl::~DistillerImpl() {
61 DCHECK(destruction_allowed_
);
64 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages
) {
65 max_pages_in_article_
= max_num_pages
;
68 bool DistillerImpl::AreAllPagesFinished() const {
69 return started_pages_index_
.empty() && waiting_pages_
.empty();
72 size_t DistillerImpl::TotalPageCount() const {
73 return waiting_pages_
.size() + started_pages_index_
.size() +
74 finished_pages_index_
.size();
77 void DistillerImpl::AddToDistillationQueue(int page_num
, const GURL
& url
) {
78 if (!IsPageNumberInUse(page_num
) && url
.is_valid() &&
79 TotalPageCount() < max_pages_in_article_
&&
80 seen_urls_
.find(url
.spec()) == seen_urls_
.end()) {
81 waiting_pages_
[page_num
] = url
;
85 bool DistillerImpl::IsPageNumberInUse(int page_num
) const {
86 return waiting_pages_
.find(page_num
) != waiting_pages_
.end() ||
87 started_pages_index_
.find(page_num
) != started_pages_index_
.end() ||
88 finished_pages_index_
.find(page_num
) != finished_pages_index_
.end();
// Looks up the DistilledPageData stored at position |index| of pages_.
// DCHECKs that |index| is within the bounds of pages_.
// NOTE(review): the tail of the signature and the statements after the lookup
// (including the return) are not visible in this view.
91 DistillerImpl::DistilledPageData
* DistillerImpl::GetPageAtIndex(size_t index
)
93 DCHECK_LT(index
, pages_
.size());
94 DistilledPageData
* page_data
= pages_
[index
];
// Entry point for distilling an article that starts at |url|.
//   |url|            - queued as page number 0.
//   |distiller_page| - ownership is moved into distiller_page_.
//   |finished_cb|    - stored in finished_cb_; run by
//                      RunDistillerCallbackIfDone().
//   |update_cb|      - stored in update_cb_; run by AddPageIfDone().
// DCHECKs that no pages are waiting or in progress when called.
// NOTE(review): any statement following the enqueue is not visible in this
// view.
99 void DistillerImpl::DistillPage(const GURL
& url
,
100 scoped_ptr
<DistillerPage
> distiller_page
,
101 const DistillationFinishedCallback
& finished_cb
,
102 const DistillationUpdateCallback
& update_cb
) {
103 DCHECK(AreAllPagesFinished());
104 distiller_page_
= distiller_page
.Pass();
105 finished_cb_
= finished_cb
;
106 update_cb_
= update_cb
;
// The first page of the article is always page number 0.
108 AddToDistillationQueue(0, url
);
// Starts distilling the next waiting page, if there is one.
// Takes the first entry of waiting_pages_ (a std::map keyed by page number, so
// this is the lowest page number), removes it from the queue, records its URL
// in seen_urls_, appends a fresh DistilledPageData to pages_, and maps the
// page number to that new slot in started_pages_index_. It then asks
// distiller_page_ to distill the page, binding OnPageDistillationFinished
// through a weak pointer as the callback.
// NOTE(review): the full argument lists of the DistillPage() and base::Bind()
// calls are not visible in this view.
112 void DistillerImpl::DistillNextPage() {
113 if (!waiting_pages_
.empty()) {
114 std::map
<int, GURL
>::iterator front
= waiting_pages_
.begin();
115 int page_num
= front
->first
;
// Copy the URL before the map entry is erased below.
116 const GURL url
= front
->second
;
118 waiting_pages_
.erase(front
);
119 DCHECK(url
.is_valid());
120 DCHECK(started_pages_index_
.find(page_num
) == started_pages_index_
.end());
121 DCHECK(finished_pages_index_
.find(page_num
) == finished_pages_index_
.end());
122 seen_urls_
.insert(url
.spec());
123 pages_
.push_back(new DistilledPageData());
// The page's data lives in the slot that was just appended to pages_.
124 started_pages_index_
[page_num
] = pages_
.size() - 1;
125 distiller_page_
->DistillPage(
127 dom_distiller_options_
,
128 base::Bind(&DistillerImpl::OnPageDistillationFinished
,
129 weak_factory_
.GetWeakPtr(),
// Callback invoked when distillation of one page has completed.
//   |page_url|                - URL of the page that was distilled.
//   |distiller_result|        - distiller output; DCHECKed to be non-null when
//                               |distillation_successful| is true.
//   |distillation_successful| - whether the distillation succeeded.
// On success this allocates the page's DistilledPageProto and copies over the
// title, URL, HTML and debug log (each only when present in the result),
// queues the next/previous pages named by the pagination info as page_num + 1
// and page_num - 1, requests every image URL through FetchImage(), and then
// calls AddPageIfDone().
// NOTE(review): the first parameter (page_num is used throughout the body),
// several closing braces and the tail of the image loop header are not
// visible in this view. The trailing erase + RunDistillerCallbackIfDone()
// looks like the failure branch, but the branch structure cannot be confirmed
// from here.
135 void DistillerImpl::OnPageDistillationFinished(
137 const GURL
& page_url
,
138 scoped_ptr
<proto::DomDistillerResult
> distiller_result
,
139 bool distillation_successful
) {
140 DCHECK(started_pages_index_
.find(page_num
) != started_pages_index_
.end());
141 if (distillation_successful
) {
142 DCHECK(distiller_result
.get());
143 DistilledPageData
* page_data
=
144 GetPageAtIndex(started_pages_index_
[page_num
]);
145 page_data
->distilled_page_proto
=
146 new base::RefCountedData
<DistilledPageProto
>();
147 page_data
->page_num
= page_num
;
148 if (distiller_result
->has_title()) {
149 page_data
->distilled_page_proto
->data
.set_title(
150 distiller_result
->title());
152 page_data
->distilled_page_proto
->data
.set_url(page_url
.spec());
153 if (distiller_result
->has_distilled_content() &&
154 distiller_result
->distilled_content().has_html()) {
155 page_data
->distilled_page_proto
->data
.set_html(
156 distiller_result
->distilled_content().html());
158 if (distiller_result
->has_debug_info() &&
159 distiller_result
->debug_info().has_log()) {
160 page_data
->distilled_page_proto
->data
.mutable_debug_info()->set_log(
161 distiller_result
->debug_info().log());
// Queue the neighbouring pages advertised by the pagination info, if any.
164 if (distiller_result
->has_pagination_info()) {
165 proto::PaginationInfo pagination_info
=
166 distiller_result
->pagination_info();
167 if (pagination_info
.has_next_page()) {
168 GURL
next_page_url(pagination_info
.next_page());
169 if (next_page_url
.is_valid()) {
170 // The pages should be in the same origin.
171 DCHECK_EQ(next_page_url
.GetOrigin(), page_url
.GetOrigin());
172 AddToDistillationQueue(page_num
+ 1, next_page_url
);
176 if (pagination_info
.has_prev_page()) {
177 GURL
prev_page_url(pagination_info
.prev_page());
178 if (prev_page_url
.is_valid()) {
179 DCHECK_EQ(prev_page_url
.GetOrigin(), page_url
.GetOrigin());
180 AddToDistillationQueue(page_num
- 1, prev_page_url
);
// Request every image referenced by the page. Image ids have the form
// "<page_num + 1>_<img_num>".
185 for (int img_num
= 0; img_num
< distiller_result
->image_urls_size();
187 std::string image_id
=
188 base::IntToString(page_num
+ 1) + "_" + base::IntToString(img_num
);
189 FetchImage(page_num
, image_id
, distiller_result
->image_urls(img_num
));
192 AddPageIfDone(page_num
);
195 started_pages_index_
.erase(page_num
);
196 RunDistillerCallbackIfDone();
// Starts fetching one image that belongs to page |page_num|.
//   |page_num| - page the image belongs to; DCHECKed to be a started page.
//   |image_id| - identifier for the image (its use is not visible in this
//                view).
//   |item|     - URL string handed to the fetcher's FetchURL().
// Creates a DistillerURLFetcher through distiller_url_fetcher_factory_, stores
// it in the page's image_fetchers_, and calls FetchURL() with
// OnFetchImageDone bound through a weak pointer as the callback.
// NOTE(review): the full base::Bind() argument list is not visible here.
200 void DistillerImpl::FetchImage(int page_num
,
201 const std::string
& image_id
,
202 const std::string
& item
) {
203 DCHECK(started_pages_index_
.find(page_num
) != started_pages_index_
.end());
204 DistilledPageData
* page_data
= GetPageAtIndex(started_pages_index_
[page_num
]);
205 DistillerURLFetcher
* fetcher
=
206 distiller_url_fetcher_factory_
.CreateDistillerURLFetcher();
// AddPageIfDone() treats a page as done only once image_fetchers_ is empty,
// so register the fetcher before starting the request.
207 page_data
->image_fetchers_
.push_back(fetcher
);
209 fetcher
->FetchURL(item
,
210 base::Bind(&DistillerImpl::OnFetchImageDone
,
211 weak_factory_
.GetWeakPtr(),
213 base::Unretained(fetcher
),
// Callback invoked when an image fetch for page |page_num| has completed.
//   |page_num|    - page the image belongs to; DCHECKed to be a started page.
//   |url_fetcher| - the fetcher that produced this callback.
//   |id|          - identifier of the image (its use is not visible in this
//                   view).
//   |response|    - fetched bytes, stored as the new image's data.
// Locates the fetcher in the page's image_fetchers_, detaches it with
// weak_erase(), schedules its deletion via DeleteSoon(), appends an image
// entry to the page proto, and then calls AddPageIfDone().
// NOTE(review): the value searched for by std::find() and some statements on
// the new image entry are not visible in this view.
217 void DistillerImpl::OnFetchImageDone(int page_num
,
218 DistillerURLFetcher
* url_fetcher
,
219 const std::string
& id
,
220 const std::string
& response
) {
221 DCHECK(started_pages_index_
.find(page_num
) != started_pages_index_
.end());
222 DistilledPageData
* page_data
= GetPageAtIndex(started_pages_index_
[page_num
]);
223 DCHECK(page_data
->distilled_page_proto
.get());
225 ScopedVector
<DistillerURLFetcher
>::iterator fetcher_it
=
226 std::find(page_data
->image_fetchers_
.begin(),
227 page_data
->image_fetchers_
.end(),
230 DCHECK(fetcher_it
!= page_data
->image_fetchers_
.end());
231 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
232 // callback is invoked by the |url_fetcher|.
233 page_data
->image_fetchers_
.weak_erase(fetcher_it
);
234 base::MessageLoop::current()->DeleteSoon(FROM_HERE
, url_fetcher
);
236 DistilledPageProto_Image
* image
=
237 page_data
->distilled_page_proto
->data
.add_image();
239 image
->set_data(response
);
// With this fetcher removed, the page may now have no outstanding fetches.
241 AddPageIfDone(page_num
);
244 void DistillerImpl::AddPageIfDone(int page_num
) {
245 DCHECK(started_pages_index_
.find(page_num
) != started_pages_index_
.end());
246 DCHECK(finished_pages_index_
.find(page_num
) == finished_pages_index_
.end());
247 DistilledPageData
* page_data
= GetPageAtIndex(started_pages_index_
[page_num
]);
248 if (page_data
->image_fetchers_
.empty()) {
249 finished_pages_index_
[page_num
] = started_pages_index_
[page_num
];
250 started_pages_index_
.erase(page_num
);
251 const ArticleDistillationUpdate
& article_update
=
252 CreateDistillationUpdate();
253 DCHECK_EQ(article_update
.GetPagesSize(), finished_pages_index_
.size());
254 update_cb_
.Run(article_update
);
255 RunDistillerCallbackIfDone();
// Builds an ArticleDistillationUpdate describing the pages finished so far.
// The update carries the distilled protos of all finished pages, collected by
// iterating finished_pages_index_ (a std::map keyed by page number), plus two
// flags: whether the page number just before the first finished page, and the
// one just after the last finished page, is in use (waiting, started or
// finished).
// Note the constructor argument order: has_next_page precedes has_prev_page.
// NOTE(review): the tail of the signature, the declaration of update_pages,
// the loop increment and the closing braces are not visible in this view.
259 const ArticleDistillationUpdate
DistillerImpl::CreateDistillationUpdate()
261 bool has_prev_page
= false;
262 bool has_next_page
= false;
263 if (!finished_pages_index_
.empty()) {
// begin()/rbegin() yield the smallest/largest finished page numbers.
264 int prev_page_num
= finished_pages_index_
.begin()->first
- 1;
265 int next_page_num
= finished_pages_index_
.rbegin()->first
+ 1;
266 has_prev_page
= IsPageNumberInUse(prev_page_num
);
267 has_next_page
= IsPageNumberInUse(next_page_num
);
270 std::vector
<scoped_refptr
<ArticleDistillationUpdate::RefCountedPageProto
> >
272 for (std::map
<int, size_t>::const_iterator it
= finished_pages_index_
.begin();
273 it
!= finished_pages_index_
.end();
275 update_pages
.push_back(pages_
[it
->second
]->distilled_page_proto
);
277 return ArticleDistillationUpdate(update_pages
, has_next_page
, has_prev_page
);
// Delivers the final article once no pages are waiting or in progress.
// DCHECKs that finished_cb_ is set. When all pages are finished, it walks
// finished_pages_index_ in page-number order, copies each page's proto into a
// new DistilledArticleProto and erases the index entry as it goes. It then
// runs finished_cb_ with the assembled article and resets the callback.
// destruction_allowed_ is overridden through base::AutoReset, declared just
// before the callback runs.
// NOTE(review): several statements are not visible in this view: the
// condition guarding set_title() (presumably tied to first_page), whatever
// empties pages_ before the DCHECK, and the value passed to AutoReset.
280 void DistillerImpl::RunDistillerCallbackIfDone() {
281 DCHECK(!finished_cb_
.is_null());
282 if (AreAllPagesFinished()) {
283 bool first_page
= true;
284 scoped_ptr
<DistilledArticleProto
> article_proto(
285 new DistilledArticleProto());
286 // Stitch the pages back into the article.
287 for (std::map
<int, size_t>::iterator it
= finished_pages_index_
.begin();
288 it
!= finished_pages_index_
.end();) {
289 DistilledPageData
* page_data
= GetPageAtIndex(it
->second
);
290 *(article_proto
->add_pages()) = page_data
->distilled_page_proto
->data
;
293 article_proto
->set_title(page_data
->distilled_page_proto
->data
.title());
// Post-increment so the iterator moves on before its element is erased.
297 finished_pages_index_
.erase(it
++);
301 DCHECK_LE(static_cast<size_t>(article_proto
->pages_size()),
302 max_pages_in_article_
);
304 DCHECK(pages_
.empty());
305 DCHECK(finished_pages_index_
.empty());
307 base::AutoReset
<bool> dont_delete_this_in_callback(&destruction_allowed_
,
309 finished_cb_
.Run(article_proto
.Pass());
310 finished_cb_
.Reset();
314 } // namespace dom_distiller