1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/id_map.h"
10 #include "base/location.h"
11 #include "base/path_service.h"
12 #include "base/run_loop.h"
13 #include "base/single_thread_task_runner.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "base/strings/string_split.h"
16 #include "base/thread_task_runner_handle.h"
17 #include "components/dom_distiller/content/distiller_page_web_contents.h"
18 #include "components/dom_distiller/core/article_entry.h"
19 #include "components/dom_distiller/core/distilled_page_prefs.h"
20 #include "components/dom_distiller/core/distiller.h"
21 #include "components/dom_distiller/core/dom_distiller_service.h"
22 #include "components/dom_distiller/core/dom_distiller_store.h"
23 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
24 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
25 #include "components/dom_distiller/core/task_tracker.h"
26 #include "components/leveldb_proto/proto_database.h"
27 #include "components/leveldb_proto/proto_database_impl.h"
28 #include "components/pref_registry/testing_pref_service_syncable.h"
29 #include "content/public/browser/browser_context.h"
30 #include "content/public/browser/browser_thread.h"
31 #include "content/public/test/content_browser_test.h"
32 #include "content/shell/browser/shell.h"
33 #include "google/protobuf/io/coded_stream.h"
34 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
35 #include "net/dns/mock_host_resolver.h"
36 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
37 #include "ui/base/resource/resource_bundle.h"
39 using content::ContentBrowserTest
;
41 namespace dom_distiller
{
45 typedef base::hash_map
<std::string
, std::string
> FileToUrlMap
;
49 // Factory for creating a Distiller that creates different DomDistillerOptions
50 // for different URLs, i.e. a specific kOriginalUrl option for each URL.
51 class TestDistillerFactoryImpl
: public DistillerFactory
{
53 TestDistillerFactoryImpl(
54 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory
,
55 const dom_distiller::proto::DomDistillerOptions
& dom_distiller_options
,
56 const FileToUrlMap
& file_to_url_map
)
57 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory
.Pass()),
58 dom_distiller_options_(dom_distiller_options
),
59 file_to_url_map_(file_to_url_map
) {
62 ~TestDistillerFactoryImpl() override
{}
64 scoped_ptr
<Distiller
> CreateDistillerForUrl(const GURL
& url
) override
{
65 dom_distiller::proto::DomDistillerOptions options
;
66 options
= dom_distiller_options_
;
67 FileToUrlMap::const_iterator it
= file_to_url_map_
.find(url
.spec());
68 if (it
!= file_to_url_map_
.end()) {
69 options
.set_original_url(it
->second
);
71 scoped_ptr
<DistillerImpl
> distiller(new DistillerImpl(
72 *distiller_url_fetcher_factory_
, options
));
73 return distiller
.Pass();
77 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory_
;
78 dom_distiller::proto::DomDistillerOptions dom_distiller_options_
;
79 FileToUrlMap file_to_url_map_
;
// Command-line switch names read by this test. They are declared as constant
// character arrays so that neither the text nor the binding can be modified at
// runtime (a plain `const char*` global is itself a mutable pointer).

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of to stdout.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Indicates to include debug output.
const char kDebugLevel[] = "debug-level";

// The original URL of the page if |kUrlSwitch| is a file.
const char kOriginalUrl[] = "original-url";

// A semi-colon-separated (i.e. ';') list of original URLs corresponding to
// the URLs given with |kUrlsSwitch|.
// NOTE(review): the code that parses this switch splits its value on ' ', not
// on ';' -- confirm which separator is intended and align the two.
const char kOriginalUrls[] = "original-urls";

// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
// Assembles the DomDistillerService used by the test from four parts:
//  - a DomDistillerStore built from a leveldb ProtoDatabaseImpl<ArticleEntry>
//    and |db_path|; the database gets a sequenced task runner obtained from
//    the browser blocking pool,
//  - a TestDistillerFactoryImpl,
//  - a DistillerPageWebContentsFactory created for |context|, and
//  - DistilledPagePrefs constructed from a TestingPrefServiceSyncable.
// The DomDistillerOptions are derived from the process command line.
// NOTE(review): some statements of this function (the debug-level parsing and
// the trailing TestDistillerFactoryImpl constructor arguments) are not visible
// in this view; |file_to_url_map| is presumably forwarded to that factory --
// confirm.
116 scoped_ptr
<DomDistillerService
> CreateDomDistillerService(
117 content::BrowserContext
* context
,
118 const base::FilePath
& db_path
,
119 const FileToUrlMap
& file_to_url_map
) {
// Task runner for the database, taken from the blocking pool with a freshly
// requested sequence token.
120 scoped_refptr
<base::SequencedTaskRunner
> background_task_runner
=
121 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
122 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
124 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
125 // temporary directory.
126 scoped_ptr
<leveldb_proto::ProtoDatabaseImpl
<ArticleEntry
> > db(
127 new leveldb_proto::ProtoDatabaseImpl
<ArticleEntry
>(
128 background_task_runner
));
// The store takes over the database object.
129 scoped_ptr
<DomDistillerStore
> dom_distiller_store(
130 new DomDistillerStore(db
.Pass(), db_path
));
132 scoped_ptr
<DistillerPageFactory
> distiller_page_factory(
133 new DistillerPageWebContentsFactory(context
));
// The URL fetcher factory is built from |context|'s request context.
134 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory(
135 new DistillerURLFetcherFactory(context
->GetRequestContext()));
// Base options shared by every distiller the factory creates.
137 dom_distiller::proto::DomDistillerOptions options
;
// The extract-text-only switch turns on text-only extraction.
138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly
)) {
139 options
.set_extract_text_only(true);
// The debug-level switch, when present, supplies the options' debug level.
// NOTE(review): the conversion of the switch value into |debug_level| is not
// visible here -- confirm how a malformed value is handled.
142 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel
) &&
144 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
147 options
.set_debug_level(debug_level
);
// The distiller factory takes over the URL fetcher factory.
149 scoped_ptr
<DistillerFactory
> distiller_factory(
150 new TestDistillerFactoryImpl(distiller_url_fetcher_factory
.Pass(),
154 // Setting up PrefService for DistilledPagePrefs.
// NOTE(review): |pref_service| is created with a raw new and only handed to
// DistilledPagePrefs as a raw pointer; nothing visible in this function
// deletes it -- confirm whether the leak is intentional for this test.
155 user_prefs::TestingPrefServiceSyncable
* pref_service
=
156 new user_prefs::TestingPrefServiceSyncable();
157 DistilledPagePrefs::RegisterProfilePrefs(pref_service
->registry());
// Ownership of the store, both factories and the prefs object moves into the
// returned service.
159 return scoped_ptr
<DomDistillerService
>(new DomDistillerService(
160 dom_distiller_store
.Pass(),
161 distiller_factory
.Pass(),
162 distiller_page_factory
.Pass(),
163 scoped_ptr
<DistilledPagePrefs
>(new DistilledPagePrefs(pref_service
))));
166 void AddComponentsTestResources() {
167 base::FilePath pak_file
;
168 base::FilePath pak_dir
;
169 PathService::Get(base::DIR_MODULE
, &pak_dir
);
171 pak_dir
.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
172 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
173 pak_file
, ui::SCALE_FACTOR_NONE
);
176 bool WriteProtobufWithSize(
177 const google::protobuf::MessageLite
& message
,
178 google::protobuf::io::ZeroCopyOutputStream
* output_stream
) {
179 google::protobuf::io::CodedOutputStream
coded_output(output_stream
);
182 const int size
= message
.ByteSize();
183 coded_output
.WriteLittleEndian32(size
);
184 message
.SerializeWithCachedSizes(&coded_output
);
185 return !coded_output
.HadError();
// Renders |article_proto| as human-readable text in a string stream: the
// article title and the page count first, then for every page its index, URL
// and HTML content, followed by the debug log and the next/previous page
// links when the page carries them. Pages after the first are preceded by a
// blank line.
// NOTE(review): the stream continuations after the next/prev page values and
// the tail of the function are not visible in this view; presumably it
// returns output.str() -- confirm.
188 std::string
GetReadableArticleString(
189 const DistilledArticleProto
& article_proto
) {
190 std::stringstream output
;
// Article-level header.
191 output
<< "Article Title: " << article_proto
.title() << std::endl
;
192 output
<< "# of pages: " << article_proto
.pages_size() << std::endl
;
193 for (int i
= 0; i
< article_proto
.pages_size(); ++i
) {
// Blank separator line between consecutive pages.
194 if (i
> 0) output
<< std::endl
;
195 const DistilledPageProto
& page
= article_proto
.pages(i
);
196 output
<< "Page " << i
<< std::endl
;
197 output
<< "URL: " << page
.url() << std::endl
;
198 output
<< "Content: " << page
.html() << std::endl
;
// The log is printed only when the page has debug info that contains a log.
199 if (page
.has_debug_info() && page
.debug_info().has_log())
200 output
<< "Log: " << page
.debug_info().log() << std::endl
;
// Pagination links are optional; each one is printed only when set.
201 if (page
.has_pagination_info()) {
202 if (page
.pagination_info().has_next_page()) {
203 output
<< "Next Page: " << page
.pagination_info().next_page()
206 if (page
.pagination_info().has_prev_page()) {
207 output
<< "Prev Page: " << page
.pagination_info().prev_page()
// One extraction request for a single URL. It is the ViewRequestDelegate
// passed to DomDistillerService::ViewUrl() and keeps the pointer to the
// article proto delivered through OnArticleReady().
// NOTE(review): several lines of this class (access specifiers, the local
// |url| declarations, the tail of the ViewUrl() call and the task posted from
// OnArticleReady()) are not visible in this view.
217 class ContentExtractionRequest
: public ViewRequestDelegate
{
// Stores |finished_callback| and asks |service| to view the request's URL,
// with this object as delegate and a default distiller page created for
// |render_view_size|.
// NOTE(review): the remaining ViewUrl() argument(s) and what receives its
// result are not visible here; presumably the result is kept in
// viewer_handle_ -- confirm.
219 void Start(DomDistillerService
* service
, const gfx::Size
& render_view_size
,
220 base::Closure finished_callback
) {
221 finished_callback_
= finished_callback
;
223 service
->ViewUrl(this,
224 service
->CreateDefaultDistillerPage(render_view_size
),
// Returns a copy of the article stored by OnArticleReady().
// NOTE(review): dereferences article_proto_ without a null check and no
// initializer for that member is visible -- confirm callers only use this
// after OnArticleReady() has run.
228 DistilledArticleProto
GetArticleCopy() {
229 return *article_proto_
;
// Creates one request per valid URL named on |command_line|.
//  - With the single-url switch: one request; if the original-url switch is
//    also given, its value is recorded in |file_to_url_map| under the URL's
//    spec.
//  - Otherwise, with the urls switch: the value is split on ' ' and a request
//    is created for every entry that parses as a valid URL.
// A test failure is added when no request could be created.
232 static ScopedVector
<ContentExtractionRequest
> CreateForCommandLine(
233 const base::CommandLine
& command_line
,
234 FileToUrlMap
* file_to_url_map
) {
235 ScopedVector
<ContentExtractionRequest
> requests
;
236 if (command_line
.HasSwitch(kUrlSwitch
)) {
238 std::string url_string
= command_line
.GetSwitchValueASCII(kUrlSwitch
);
239 url
= GURL(url_string
);
240 if (url
.is_valid()) {
241 requests
.push_back(new ContentExtractionRequest(url
));
242 if (command_line
.HasSwitch(kOriginalUrl
)) {
243 (*file_to_url_map
)[url
.spec()] =
244 command_line
.GetSwitchValueASCII(kOriginalUrl
);
247 } else if (command_line
.HasSwitch(kUrlsSwitch
)) {
248 std::string urls_string
= command_line
.GetSwitchValueASCII(kUrlsSwitch
);
249 std::vector
<std::string
> urls
;
250 base::SplitString(urls_string
, ' ', &urls
);
251 // Check for original-urls switch, which must exactly pair up with
252 // |kUrlsSwitch| i.e. number of original urls must be same as that of
// urls; when the counts differ, all original urls are dropped.
254 std::vector
<std::string
> original_urls
;
255 if (command_line
.HasSwitch(kOriginalUrls
)) {
256 std::string original_urls_string
=
257 command_line
.GetSwitchValueASCII(kOriginalUrls
);
// NOTE(review): the value is split on ' ' here, while the comment on
// |kOriginalUrls| describes a ';'-separated list -- confirm the separator.
258 base::SplitString(original_urls_string
, ' ', &original_urls
);
// A count mismatch silently discards every original url (no failure is
// reported).
259 if (original_urls
.size() != urls
.size()) original_urls
.clear();
261 for (size_t i
= 0; i
< urls
.size(); ++i
) {
// NOTE(review): the declaration of |url| for this iteration is not visible
// here; presumably it is built from urls[i] -- confirm.
263 if (url
.is_valid()) {
264 requests
.push_back(new ContentExtractionRequest(url
));
265 // Only regard non-empty original urls.
266 if (!original_urls
.empty() && !original_urls
[i
].empty()) {
267 (*file_to_url_map
)[url
.spec()] = original_urls
[i
];
270 ADD_FAILURE() << "Bad url";
274 if (requests
.empty()) {
275 ADD_FAILURE() << "No valid url provided";
278 return requests
.Pass();
// Creates a request for |url|.
282 ContentExtractionRequest(const GURL
& url
) : url_(url
) {}
// Incremental updates are ignored; only the final article is used.
284 void OnArticleUpdated(ArticleDistillationUpdate article_update
) override
{}
// Stores the pointer to the finished article (no copy is made here), CHECKs
// that it has at least one page, and posts a task to the current thread's
// task runner.
// NOTE(review): the posted task is not visible here; presumably it is
// finished_callback_ -- confirm.
286 void OnArticleReady(const DistilledArticleProto
* article_proto
) override
{
287 article_proto_
= article_proto
;
288 CHECK(article_proto
->pages_size()) << "Failed extracting " << url_
;
289 base::ThreadTaskRunnerHandle::Get()->PostTask(FROM_HERE
,
// Article received in OnArticleReady(); stored as a raw pointer.
293 const DistilledArticleProto
* article_proto_
;
294 scoped_ptr
<ViewerHandle
> viewer_handle_
;
// Callback supplied to Start().
296 base::Closure finished_callback_
;
// Browser-test fixture that creates extraction requests from the command
// line, starts them while fewer than max_tasks_ are pending, and finally
// writes the collected articles either to a file or to the log.
// NOTE(review): several lines of this class (the constructor head, the
// signature of the method that starts the requests, the body of
// FinishRequest() and various closing braces) are not visible in this view.
299 class ContentExtractor
: public ContentBrowserTest
{
// Constructor initializer list (head not visible): the task limit comes from
// kMaxExtractorTasks and the protobuf output stream appends to output_data_.
303 max_tasks_(kMaxExtractorTasks
),
306 protobuf_output_stream_(
307 new google::protobuf::io::StringOutputStream(&output_data_
)) {}
309 // Change behavior of the default host resolver to avoid DNS lookup errors, so
310 // we can make network calls.
// DNS lookups are enabled unless the disable-dns switch is present. Also
// creates the temporary database directory and registers the test resources.
311 void SetUpOnMainThread() override
{
312 if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch
)) {
313 EnableDNSLookupForThisTest();
315 CHECK(db_dir_
.CreateUniqueTempDir());
316 AddComponentsTestResources();
// Always drops the resolver override, whether or not one was installed.
319 void TearDownOnMainThread() override
{ DisableDNSLookupForThisTest(); }
322 // Creates the DomDistillerService and creates and starts the extraction
// requests.
325 const base::CommandLine
& command_line
=
326 *base::CommandLine::ForCurrentProcess();
// Filled by CreateForCommandLine() with URL spec -> original URL entries.
327 FileToUrlMap file_to_url_map
;
328 requests_
= ContentExtractionRequest::CreateForCommandLine(
329 command_line
, &file_to_url_map
);
330 content::BrowserContext
* context
=
331 shell()->web_contents()->GetBrowserContext();
// NOTE(review): the remaining CreateDomDistillerService() arguments are not
// visible here; presumably the temp dir path and |file_to_url_map| -- confirm.
332 service_
= CreateDomDistillerService(context
,
// Start requests in order until the concurrency limit is reached or none are
// left.
// NOTE(review): the updates of pending_tasks_ and next_request_ are not
// visible in this view.
339 while (pending_tasks_
< max_tasks_
&& next_request_
< requests_
.size()) {
340 requests_
[next_request_
]->Start(
342 shell()->web_contents()->GetContainerBounds().size(),
// base::Unretained: the callback holds a raw pointer to this fixture.
343 base::Bind(&ContentExtractor::FinishRequest
, base::Unretained(this)));
350 // Change behavior of the default host resolver to allow DNS lookup
351 // to proceed instead of being blocked by the test infrastructure.
352 void EnableDNSLookupForThisTest() {
353 // mock_host_resolver_override_ takes ownership of the resolver.
354 scoped_refptr
<net::RuleBasedHostResolverProc
> resolver
=
355 new net::RuleBasedHostResolverProc(host_resolver());
// "*" allows direct lookup for every host pattern.
356 resolver
->AllowDirectLookup("*");
357 mock_host_resolver_override_
.reset(
358 new net::ScopedDefaultHostResolverProc(resolver
.get()));
361 // We need to reset the DNS lookup when we finish, or the test will fail.
362 void DisableDNSLookupForThisTest() {
363 mock_host_resolver_override_
.reset();
// Callback bound into every started request.
// NOTE(review): most of the body is not visible here; the visible condition
// is true once every request has been started and none is pending.
366 void FinishRequest() {
368 if (next_request_
== requests_
.size() && pending_tasks_
== 0) {
// Collects every request's article into output_data_ -- as size-prefixed
// binary protobufs when the output-binary switch is set, otherwise as
// readable text followed by "\n" -- then writes the data to the file named
// by the output-file switch or logs it with VLOG(0).
375 void DoArticleOutput() {
376 const base::CommandLine
& command_line
=
377 *base::CommandLine::ForCurrentProcess();
378 for (size_t i
= 0; i
< requests_
.size(); ++i
) {
379 const DistilledArticleProto
& article
= requests_
[i
]->GetArticleCopy();
380 if (command_line
.HasSwitch(kShouldOutputBinary
)) {
// The stream was constructed on &output_data_, so this appends to the same
// buffer as the text branch. The boolean result is not checked.
381 WriteProtobufWithSize(article
, protobuf_output_stream_
.get());
383 output_data_
+= GetReadableArticleString(article
) + "\n";
387 if (command_line
.HasSwitch(kOutputFile
)) {
388 base::FilePath filename
= command_line
.GetSwitchValuePath(kOutputFile
);
// NOTE(review): the enclosing check is not visible here; it compares the
// buffer size with the value returned by base::WriteFile() -- confirm.
390 (int)output_data_
.size(),
391 base::WriteFile(filename
, output_data_
.c_str(), output_data_
.size()));
393 VLOG(0) << output_data_
;
// Posts a quit-when-idle closure to the current thread's task runner.
// NOTE(review): the enclosing function of this statement is not visible here.
401 base::ThreadTaskRunnerHandle::Get()->PostTask(
402 FROM_HERE
, base::MessageLoop::QuitWhenIdleClosure());
// Compared against max_tasks_ when starting requests and against zero in
// FinishRequest().
405 size_t pending_tasks_
;
// Index into requests_ of the next request to start.
407 size_t next_request_
;
// Temporary directory created in SetUpOnMainThread().
409 base::ScopedTempDir db_dir_
;
410 scoped_ptr
<net::ScopedDefaultHostResolverProc
> mock_host_resolver_override_
;
411 scoped_ptr
<DomDistillerService
> service_
;
412 ScopedVector
<ContentExtractionRequest
> requests_
;
// Accumulated output; also the backing string of protobuf_output_stream_.
414 std::string output_data_
;
415 scoped_ptr
<google::protobuf::io::StringOutputStream
> protobuf_output_stream_
;
// Test body for the ContentExtractor fixture: runs a RunLoop on the current
// thread.
// NOTE(review): a statement preceding the RunLoop is not visible in this
// view; presumably it starts the extraction requests -- confirm.
418 IN_PROC_BROWSER_TEST_F(ContentExtractor
, MANUAL_ExtractUrl
) {
420 base::RunLoop().Run();
423 } // namespace dom_distiller