1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/id_map.h"
10 #include "base/message_loop/message_loop.h"
11 #include "base/path_service.h"
12 #include "base/run_loop.h"
13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/string_split.h"
15 #include "components/dom_distiller/content/distiller_page_web_contents.h"
16 #include "components/dom_distiller/core/article_entry.h"
17 #include "components/dom_distiller/core/distilled_page_prefs.h"
18 #include "components/dom_distiller/core/distiller.h"
19 #include "components/dom_distiller/core/dom_distiller_service.h"
20 #include "components/dom_distiller/core/dom_distiller_store.h"
21 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
22 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
23 #include "components/dom_distiller/core/task_tracker.h"
24 #include "components/leveldb_proto/proto_database.h"
25 #include "components/leveldb_proto/proto_database_impl.h"
26 #include "components/pref_registry/testing_pref_service_syncable.h"
27 #include "content/public/browser/browser_context.h"
28 #include "content/public/browser/browser_thread.h"
29 #include "content/public/test/content_browser_test.h"
30 #include "content/shell/browser/shell.h"
31 #include "google/protobuf/io/coded_stream.h"
32 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
33 #include "net/dns/mock_host_resolver.h"
34 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
35 #include "ui/base/resource/resource_bundle.h"
37 using content::ContentBrowserTest
;
39 namespace dom_distiller
{
43 typedef base::hash_map
<std::string
, std::string
> FileToUrlMap
;
47 // Factory for creating a Distiller that creates different DomDistillerOptions
48 // for different URLs, i.e. a specific kOriginalUrl option for each URL.
49 class TestDistillerFactoryImpl
: public DistillerFactory
{
51 TestDistillerFactoryImpl(
52 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory
,
53 const dom_distiller::proto::DomDistillerOptions
& dom_distiller_options
,
54 const FileToUrlMap
& file_to_url_map
)
55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory
.Pass()),
56 dom_distiller_options_(dom_distiller_options
),
57 file_to_url_map_(file_to_url_map
) {
60 ~TestDistillerFactoryImpl() override
{}
62 scoped_ptr
<Distiller
> CreateDistillerForUrl(const GURL
& url
) override
{
63 dom_distiller::proto::DomDistillerOptions options
;
64 options
= dom_distiller_options_
;
65 FileToUrlMap::const_iterator it
= file_to_url_map_
.find(url
.spec());
66 if (it
!= file_to_url_map_
.end()) {
67 options
.set_original_url(it
->second
);
69 scoped_ptr
<DistillerImpl
> distiller(new DistillerImpl(
70 *distiller_url_fetcher_factory_
, options
));
71 return distiller
.Pass();
75 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory_
;
76 dom_distiller::proto::DomDistillerOptions dom_distiller_options_
;
77 FileToUrlMap file_to_url_map_
;
// Command-line switch names understood by this test. They are declared as
// constant character arrays (not mutable `const char*` pointers) so the names
// themselves cannot be reassigned.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of to stdout.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Indicates to include debug output.
const char kDebugLevel[] = "debug-level";

// The original URL of the page if |kUrlSwitch| is a file.
const char kOriginalUrl[] = "original-url";

// A space-separated list of original URLs corresponding, position by
// position, to the urls given in |kUrlsSwitch|. The value is split on ' ' when
// it is parsed, so ';' is not treated as a separator.
const char kOriginalUrls[] = "original-urls";

// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
114 scoped_ptr
<DomDistillerService
> CreateDomDistillerService(
115 content::BrowserContext
* context
,
116 const base::FilePath
& db_path
,
117 const FileToUrlMap
& file_to_url_map
) {
118 scoped_refptr
<base::SequencedTaskRunner
> background_task_runner
=
119 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
120 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
122 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
123 // temporary directory.
124 scoped_ptr
<leveldb_proto::ProtoDatabaseImpl
<ArticleEntry
> > db(
125 new leveldb_proto::ProtoDatabaseImpl
<ArticleEntry
>(
126 background_task_runner
));
127 scoped_ptr
<DomDistillerStore
> dom_distiller_store(
128 new DomDistillerStore(db
.Pass(), db_path
));
130 scoped_ptr
<DistillerPageFactory
> distiller_page_factory(
131 new DistillerPageWebContentsFactory(context
));
132 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory(
133 new DistillerURLFetcherFactory(context
->GetRequestContext()));
135 dom_distiller::proto::DomDistillerOptions options
;
136 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly
)) {
137 options
.set_extract_text_only(true);
140 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel
) &&
142 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
145 options
.set_debug_level(debug_level
);
147 scoped_ptr
<DistillerFactory
> distiller_factory(
148 new TestDistillerFactoryImpl(distiller_url_fetcher_factory
.Pass(),
152 // Setting up PrefService for DistilledPagePrefs.
153 user_prefs::TestingPrefServiceSyncable
* pref_service
=
154 new user_prefs::TestingPrefServiceSyncable();
155 DistilledPagePrefs::RegisterProfilePrefs(pref_service
->registry());
157 return scoped_ptr
<DomDistillerService
>(new DomDistillerService(
158 dom_distiller_store
.Pass(),
159 distiller_factory
.Pass(),
160 distiller_page_factory
.Pass(),
161 scoped_ptr
<DistilledPagePrefs
>(new DistilledPagePrefs(pref_service
))));
164 void AddComponentsTestResources() {
165 base::FilePath pak_file
;
166 base::FilePath pak_dir
;
167 PathService::Get(base::DIR_MODULE
, &pak_dir
);
169 pak_dir
.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
170 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
171 pak_file
, ui::SCALE_FACTOR_NONE
);
174 bool WriteProtobufWithSize(
175 const google::protobuf::MessageLite
& message
,
176 google::protobuf::io::ZeroCopyOutputStream
* output_stream
) {
177 google::protobuf::io::CodedOutputStream
coded_output(output_stream
);
180 const int size
= message
.ByteSize();
181 coded_output
.WriteLittleEndian32(size
);
182 message
.SerializeWithCachedSizes(&coded_output
);
183 return !coded_output
.HadError();
186 std::string
GetReadableArticleString(
187 const DistilledArticleProto
& article_proto
) {
188 std::stringstream output
;
189 output
<< "Article Title: " << article_proto
.title() << std::endl
;
190 output
<< "# of pages: " << article_proto
.pages_size() << std::endl
;
191 for (int i
= 0; i
< article_proto
.pages_size(); ++i
) {
192 if (i
> 0) output
<< std::endl
;
193 const DistilledPageProto
& page
= article_proto
.pages(i
);
194 output
<< "Page " << i
<< std::endl
;
195 output
<< "URL: " << page
.url() << std::endl
;
196 output
<< "Content: " << page
.html() << std::endl
;
197 if (page
.has_debug_info() && page
.debug_info().has_log())
198 output
<< "Log: " << page
.debug_info().log() << std::endl
;
199 if (page
.has_pagination_info()) {
200 if (page
.pagination_info().has_next_page()) {
201 output
<< "Next Page: " << page
.pagination_info().next_page()
204 if (page
.pagination_info().has_prev_page()) {
205 output
<< "Prev Page: " << page
.pagination_info().prev_page()
215 class ContentExtractionRequest
: public ViewRequestDelegate
{
217 void Start(DomDistillerService
* service
, const gfx::Size
& render_view_size
,
218 base::Closure finished_callback
) {
219 finished_callback_
= finished_callback
;
221 service
->ViewUrl(this,
222 service
->CreateDefaultDistillerPage(render_view_size
),
226 DistilledArticleProto
GetArticleCopy() {
227 return *article_proto_
;
230 static ScopedVector
<ContentExtractionRequest
> CreateForCommandLine(
231 const base::CommandLine
& command_line
,
232 FileToUrlMap
* file_to_url_map
) {
233 ScopedVector
<ContentExtractionRequest
> requests
;
234 if (command_line
.HasSwitch(kUrlSwitch
)) {
236 std::string url_string
= command_line
.GetSwitchValueASCII(kUrlSwitch
);
237 url
= GURL(url_string
);
238 if (url
.is_valid()) {
239 requests
.push_back(new ContentExtractionRequest(url
));
240 if (command_line
.HasSwitch(kOriginalUrl
)) {
241 (*file_to_url_map
)[url
.spec()] =
242 command_line
.GetSwitchValueASCII(kOriginalUrl
);
245 } else if (command_line
.HasSwitch(kUrlsSwitch
)) {
246 std::string urls_string
= command_line
.GetSwitchValueASCII(kUrlsSwitch
);
247 std::vector
<std::string
> urls
;
248 base::SplitString(urls_string
, ' ', &urls
);
249 // Check for original-urls switch, which must exactly pair up with
250 // |kUrlsSwitch| i.e. number of original urls must be same as that of
252 std::vector
<std::string
> original_urls
;
253 if (command_line
.HasSwitch(kOriginalUrls
)) {
254 std::string original_urls_string
=
255 command_line
.GetSwitchValueASCII(kOriginalUrls
);
256 base::SplitString(original_urls_string
, ' ', &original_urls
);
257 if (original_urls
.size() != urls
.size()) original_urls
.clear();
259 for (size_t i
= 0; i
< urls
.size(); ++i
) {
261 if (url
.is_valid()) {
262 requests
.push_back(new ContentExtractionRequest(url
));
263 // Only regard non-empty original urls.
264 if (!original_urls
.empty() && !original_urls
[i
].empty()) {
265 (*file_to_url_map
)[url
.spec()] = original_urls
[i
];
268 ADD_FAILURE() << "Bad url";
272 if (requests
.empty()) {
273 ADD_FAILURE() << "No valid url provided";
276 return requests
.Pass();
280 ContentExtractionRequest(const GURL
& url
) : url_(url
) {}
282 void OnArticleUpdated(ArticleDistillationUpdate article_update
) override
{}
284 void OnArticleReady(const DistilledArticleProto
* article_proto
) override
{
285 article_proto_
= article_proto
;
286 CHECK(article_proto
->pages_size()) << "Failed extracting " << url_
;
287 base::MessageLoop::current()->PostTask(
292 const DistilledArticleProto
* article_proto_
;
293 scoped_ptr
<ViewerHandle
> viewer_handle_
;
295 base::Closure finished_callback_
;
// Browser-test fixture that distills the URLs named on the command line and
// emits the result either to a file or to the log.
//
// NOTE(review): this listing is damaged. The number at the start of a line is
// the line's position in the original file, and gaps in that numbering (for
// example 299-301, 332-337, 343-348, 366, 368-373, 393-399) mark lines that
// are not present here. The constructor header, the headers of two methods and
// most of FinishRequest() are among the missing lines, so the comments below
// describe only what the remaining lines show.
298 class ContentExtractor
: public ContentBrowserTest
{
// Tail of the constructor's initializer list: the task limit comes from
// kMaxExtractorTasks, and the protobuf output stream is a StringOutputStream
// over |output_data_|.
302 max_tasks_(kMaxExtractorTasks
),
305 protobuf_output_stream_(
306 new google::protobuf::io::StringOutputStream(&output_data_
)) {}
308 // Change behavior of the default host resolver to avoid DNS lookup errors, so
309 // we can make network calls.
// DNS lookup is enabled unless |kDisableDnsSwitch| is present. The method also
// creates the unique temporary directory held by |db_dir_| (aborting via CHECK
// on failure) and registers the components test resources.
310 void SetUpOnMainThread() override
{
311 if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch
)) {
312 EnableDNSLookupForThisTest();
314 CHECK(db_dir_
.CreateUniqueTempDir());
315 AddComponentsTestResources();
// Undoes the host resolver override on teardown.
318 void TearDownOnMainThread() override
{ DisableDNSLookupForThisTest(); }
321 // Creates the DomDistillerService and creates and starts the extraction
// NOTE(review): the rest of the comment above and the header of the method it
// documents are missing. The visible body builds the requests from the command
// line, filling |file_to_url_map|, and then creates the service for the
// shell's browser context.
324 const base::CommandLine
& command_line
=
325 *base::CommandLine::ForCurrentProcess();
326 FileToUrlMap file_to_url_map
;
327 requests_
= ContentExtractionRequest::CreateForCommandLine(
328 command_line
, &file_to_url_map
);
329 content::BrowserContext
* context
=
330 shell()->web_contents()->GetBrowserContext();
331 service_
= CreateDomDistillerService(context
,
// NOTE(review): the remaining arguments of the call above and the header of
// the method containing the loop below are missing. The loop starts queued
// requests while fewer than |max_tasks_| are pending and unstarted requests
// remain. Each request gets FinishRequest() as its completion callback, bound
// to this fixture as an unretained pointer.
338 while (pending_tasks_
< max_tasks_
&& next_request_
< requests_
.size()) {
339 requests_
[next_request_
]->Start(
341 shell()->web_contents()->GetContainerBounds().size(),
342 base::Bind(&ContentExtractor::FinishRequest
, base::Unretained(this)));
349 // Change behavior of the default host resolver to allow DNS lookup
350 // to proceed instead of being blocked by the test infrastructure.
351 void EnableDNSLookupForThisTest() {
352 // mock_host_resolver_override_ takes ownership of the resolver.
353 scoped_refptr
<net::RuleBasedHostResolverProc
> resolver
=
354 new net::RuleBasedHostResolverProc(host_resolver());
355 resolver
->AllowDirectLookup("*");
356 mock_host_resolver_override_
.reset(
357 new net::ScopedDefaultHostResolverProc(resolver
.get()));
360 // We need to reset the DNS lookup when we finish, or the test will fail.
361 void DisableDNSLookupForThisTest() {
362 mock_host_resolver_override_
.reset();
// Completion callback for a single request. The visible condition is true once
// every request has been started and none is pending.
// NOTE(review): the statements before the condition and in both of its
// branches are missing.
365 void FinishRequest() {
367 if (next_request_
== requests_
.size() && pending_tasks_
== 0) {
// Emits every distilled article. With |kShouldOutputBinary| each article is
// written as a size-prefixed protobuf through |protobuf_output_stream_|, which
// was constructed over |output_data_|; otherwise its readable form plus "\n"
// is appended to |output_data_|. The accumulated data is then written to the
// path given by |kOutputFile| when that switch is present; the VLOG(0) line is
// the other visible output path.
// NOTE(review): the line that opens the expression comparing the data size
// with the WriteFile() result is missing — presumably an equality assertion;
// confirm.
374 void DoArticleOutput() {
375 const base::CommandLine
& command_line
=
376 *base::CommandLine::ForCurrentProcess();
377 for (size_t i
= 0; i
< requests_
.size(); ++i
) {
378 const DistilledArticleProto
& article
= requests_
[i
]->GetArticleCopy();
379 if (command_line
.HasSwitch(kShouldOutputBinary
)) {
380 WriteProtobufWithSize(article
, protobuf_output_stream_
.get());
382 output_data_
+= GetReadableArticleString(article
) + "\n";
386 if (command_line
.HasSwitch(kOutputFile
)) {
387 base::FilePath filename
= command_line
.GetSwitchValuePath(kOutputFile
);
389 (int)output_data_
.size(),
390 base::WriteFile(filename
, output_data_
.c_str(), output_data_
.size()));
392 VLOG(0) << output_data_
;
// NOTE(review): the header and first statements of the method containing the
// call below are missing. The visible statement posts a quit-when-idle closure
// to the current message loop.
400 base::MessageLoop::current()->PostTask(
401 FROM_HERE
, base::MessageLoop::QuitWhenIdleClosure());
// Number of started requests that have not finished yet.
404 size_t pending_tasks_
;
// Index into |requests_| of the next request to start.
// NOTE(review): the declaration of |max_tasks_|, which the code above uses, is
// missing.
406 size_t next_request_
;
// Temporary directory created in SetUpOnMainThread().
408 base::ScopedTempDir db_dir_
;
409 scoped_ptr
<net::ScopedDefaultHostResolverProc
> mock_host_resolver_override_
;
410 scoped_ptr
<DomDistillerService
> service_
;
411 ScopedVector
<ContentExtractionRequest
> requests_
;
// Accumulated output; |protobuf_output_stream_| writes into this string.
413 std::string output_data_
;
414 scoped_ptr
<google::protobuf::io::StringOutputStream
> protobuf_output_stream_
;
// Browser test on the ContentExtractor fixture; the visible statement runs a
// RunLoop until it is quit.
// NOTE(review): the embedded numbering skips 418 and 420, so the statement
// that precedes Run() (presumably the call that kicks off extraction) and the
// closing brace of this test are missing from this listing — confirm.
417 IN_PROC_BROWSER_TEST_F(ContentExtractor
, MANUAL_ExtractUrl
) {
419 base::RunLoop().Run();
422 } // namespace dom_distiller