1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
6 #define COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
11 #include "base/callback.h"
12 #include "base/containers/hash_tables.h"
13 #include "base/memory/ref_counted.h"
14 #include "base/memory/scoped_ptr.h"
15 #include "base/memory/scoped_vector.h"
16 #include "base/memory/weak_ptr.h"
17 #include "components/dom_distiller/core/article_distillation_update.h"
18 #include "components/dom_distiller/core/distiller_page.h"
19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "net/url_request/url_request_context_getter.h"
24 namespace dom_distiller
{
30 typedef base::Callback
<void(scoped_ptr
<DistilledArticleProto
>)>
31 DistillationFinishedCallback
;
32 typedef base::Callback
<void(const ArticleDistillationUpdate
&)>
33 DistillationUpdateCallback
;
35 virtual ~Distiller() {}
37 // Distills a page, and asynchronously returns the article HTML to the
38 // supplied |finished_cb| callback. |update_cb| is invoked whenever article
39 // under distillation is updated with more data.
40 // E.g. when distilling a 2 page article, |update_cb| may be invoked each time
41 // a distilled page is added and |finished_cb| will be invoked once
42 // distillation is completed.
43 virtual void DistillPage(const GURL
& url
,
44 scoped_ptr
<DistillerPage
> distiller_page
,
45 const DistillationFinishedCallback
& finished_cb
,
46 const DistillationUpdateCallback
& update_cb
) = 0;
49 class DistillerFactory
{
51 virtual scoped_ptr
<Distiller
> CreateDistillerForUrl(const GURL
& url
) = 0;
52 virtual ~DistillerFactory() {}
55 // Factory for creating a Distiller.
56 class DistillerFactoryImpl
: public DistillerFactory
{
59 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory
,
60 const dom_distiller::proto::DomDistillerOptions
& dom_distiller_options
);
61 ~DistillerFactoryImpl() override
;
62 scoped_ptr
<Distiller
> CreateDistillerForUrl(const GURL
& url
) override
;
65 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory_
;
66 dom_distiller::proto::DomDistillerOptions dom_distiller_options_
;
// Distills an article from a page and associated pages.
70 class DistillerImpl
: public Distiller
{
73 const DistillerURLFetcherFactory
& distiller_url_fetcher_factory
,
74 const dom_distiller::proto::DomDistillerOptions
& dom_distiller_options
);
75 ~DistillerImpl() override
;
77 void DistillPage(const GURL
& url
,
78 scoped_ptr
<DistillerPage
> distiller_page
,
79 const DistillationFinishedCallback
& finished_cb
,
80 const DistillationUpdateCallback
& update_cb
) override
;
// Sets the maximum number of pages in an article to |max_num_pages|.
// NOTE(review): presumably stored in |max_pages_in_article_| - confirm in the
// .cc.
void SetMaxNumPagesInArticle(size_t max_num_pages);
// In case of multiple pages, the Distiller maintains state of multiple pages
// as page numbers relative to the page number where distillation started.
// E.g. if distillation starts at page 2 of a 3 page article, the relative
// page numbers assigned to the pages will be [-1,0,1].
90 // Class representing the state of a page under distillation.
91 struct DistilledPageData
{
93 virtual ~DistilledPageData();
94 // Relative page number of the page.
96 ScopedVector
<DistillerURLFetcher
> image_fetchers_
;
97 scoped_refptr
<base::RefCountedData
<DistilledPageProto
> >
101 DISALLOW_COPY_AND_ASSIGN(DistilledPageData
);
104 void OnFetchImageDone(int page_num
,
105 DistillerURLFetcher
* url_fetcher
,
106 const std::string
& id
,
107 const std::string
& original_url
,
108 const std::string
& response
);
110 void OnPageDistillationFinished(
112 const GURL
& page_url
,
113 scoped_ptr
<proto::DomDistillerResult
> distilled_page
,
114 bool distillation_successful
);
116 virtual void FetchImage(int page_num
,
117 const std::string
& image_id
,
118 const std::string
& image_url
);
// Starts distillation of the next page.
// NOTE(review): presumably the next entry of |waiting_pages_| - confirm in
// the .cc.
void DistillNextPage();
123 // Adds the |url| to |pages_to_be_distilled| if |page_num| is a valid relative
124 // page number and |url| is valid. Ignores duplicate pages and urls.
125 void AddToDistillationQueue(int page_num
, const GURL
& url
);
127 // Check if |page_num| is a valid relative page number, i.e. page with
128 // |page_num| is either under distillation or has already completed
130 bool IsPageNumberInUse(int page_num
) const;
132 bool AreAllPagesFinished() const;
134 // Total number of pages in the article that the distiller knows of, this
135 // includes pages that are pending distillation.
136 size_t TotalPageCount() const;
// Runs |finished_cb_| once all distillation callbacks and image fetches are
// complete.
void RunDistillerCallbackIfDone();
// Checks whether the page with relative number |page_num| has finished
// distillation, including all of its image fetches.
// NOTE(review): presumably records the page as finished when that is the
// case - confirm in the .cc.
void AddPageIfDone(int page_num);
146 DistilledPageData
* GetPageAtIndex(size_t index
) const;
148 // Create an ArticleDistillationUpdate for the current distillation
150 const ArticleDistillationUpdate
CreateDistillationUpdate() const;
152 const DistillerURLFetcherFactory
& distiller_url_fetcher_factory_
;
153 scoped_ptr
<DistillerPage
> distiller_page_
;
155 dom_distiller::proto::DomDistillerOptions dom_distiller_options_
;
156 DistillationFinishedCallback finished_cb_
;
157 DistillationUpdateCallback update_cb_
;
159 // Set of pages that are under distillation or have finished distillation.
160 // |started_pages_index_| and |finished_pages_index_| maintains the mapping
161 // from page number to the indices in |pages_|.
162 ScopedVector
<DistilledPageData
> pages_
;
164 // Maps page numbers of finished pages to the indices in |pages_|.
165 std::map
<int, size_t> finished_pages_index_
;
167 // Maps page numbers of pages under distillation to the indices in |pages_|.
168 // If a page is |started_pages_| that means it is still waiting for an action
169 // (distillation or image fetch) to finish.
170 base::hash_map
<int, size_t> started_pages_index_
;
172 // The list of pages that are still waiting for distillation to start.
173 // This is a map, to make distiller prefer distilling lower page numbers
175 std::map
<int, GURL
> waiting_pages_
;
177 // Set to keep track of which urls are already seen by the distiller. Used to
178 // prevent distiller from distilling the same url twice.
179 base::hash_set
<std::string
> seen_urls_
;
181 size_t max_pages_in_article_
;
183 bool destruction_allowed_
;
185 base::WeakPtrFactory
<DistillerImpl
> weak_factory_
;
187 DISALLOW_COPY_AND_ASSIGN(DistillerImpl
);
190 } // namespace dom_distiller
192 #endif // COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_