1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/id_map.h"
10 #include "base/location.h"
11 #include "base/path_service.h"
12 #include "base/run_loop.h"
13 #include "base/single_thread_task_runner.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "base/strings/string_split.h"
16 #include "base/thread_task_runner_handle.h"
17 #include "components/dom_distiller/content/browser/distiller_javascript_utils.h"
18 #include "components/dom_distiller/content/browser/distiller_page_web_contents.h"
19 #include "components/dom_distiller/core/article_entry.h"
20 #include "components/dom_distiller/core/distilled_page_prefs.h"
21 #include "components/dom_distiller/core/distiller.h"
22 #include "components/dom_distiller/core/dom_distiller_service.h"
23 #include "components/dom_distiller/core/dom_distiller_store.h"
24 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
25 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
26 #include "components/dom_distiller/core/task_tracker.h"
27 #include "components/leveldb_proto/proto_database.h"
28 #include "components/leveldb_proto/proto_database_impl.h"
29 #include "components/pref_registry/testing_pref_service_syncable.h"
30 #include "content/public/browser/browser_context.h"
31 #include "content/public/browser/browser_thread.h"
32 #include "content/public/common/isolated_world_ids.h"
33 #include "content/public/test/content_browser_test.h"
34 #include "content/shell/browser/shell.h"
35 #include "google/protobuf/io/coded_stream.h"
36 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
37 #include "net/dns/mock_host_resolver.h"
38 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
39 #include "ui/base/resource/resource_bundle.h"
41 using content::ContentBrowserTest
;
43 namespace dom_distiller
{
47 typedef base::hash_map
<std::string
, std::string
> FileToUrlMap
;
51 // Factory for creating a Distiller that creates different DomDistillerOptions
52 // for different URLs, i.e. a specific kOriginalUrl option for each URL.
53 class TestDistillerFactoryImpl
: public DistillerFactory
{
55 TestDistillerFactoryImpl(
56 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory
,
57 const dom_distiller::proto::DomDistillerOptions
& dom_distiller_options
,
58 const FileToUrlMap
& file_to_url_map
)
59 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory
.Pass()),
60 dom_distiller_options_(dom_distiller_options
),
61 file_to_url_map_(file_to_url_map
) {
64 ~TestDistillerFactoryImpl() override
{}
66 scoped_ptr
<Distiller
> CreateDistillerForUrl(const GURL
& url
) override
{
67 dom_distiller::proto::DomDistillerOptions options
;
68 options
= dom_distiller_options_
;
69 FileToUrlMap::const_iterator it
= file_to_url_map_
.find(url
.spec());
70 if (it
!= file_to_url_map_
.end()) {
71 options
.set_original_url(it
->second
);
73 scoped_ptr
<DistillerImpl
> distiller(new DistillerImpl(
74 *distiller_url_fetcher_factory_
, options
));
75 return distiller
.Pass();
79 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory_
;
80 dom_distiller::proto::DomDistillerOptions dom_distiller_options_
;
81 FileToUrlMap file_to_url_map_
;
// Command-line switch names understood by this test. They are declared as
// constant arrays (not mutable pointers) so they cannot be re-pointed at
// runtime.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of to stdout.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Indicates to include debug output.
const char kDebugLevel[] = "debug-level";

// The original URL of the page if |kUrlSwitch| is a file.
const char kOriginalUrl[] = "original-url";

// A list of original URLs corresponding to the URLs in |kUrlsSwitch|. The
// value is split on spaces, like |kUrlsSwitch|, and the whole list is ignored
// unless it has exactly as many entries as |kUrlsSwitch|.
const char kOriginalUrls[] = "original-urls";

// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
118 scoped_ptr
<DomDistillerService
> CreateDomDistillerService(
119 content::BrowserContext
* context
,
120 const base::FilePath
& db_path
,
121 const FileToUrlMap
& file_to_url_map
) {
122 scoped_refptr
<base::SequencedTaskRunner
> background_task_runner
=
123 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
124 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
126 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
127 // temporary directory.
128 scoped_ptr
<leveldb_proto::ProtoDatabaseImpl
<ArticleEntry
> > db(
129 new leveldb_proto::ProtoDatabaseImpl
<ArticleEntry
>(
130 background_task_runner
));
131 scoped_ptr
<DomDistillerStore
> dom_distiller_store(
132 new DomDistillerStore(db
.Pass(), db_path
));
134 scoped_ptr
<DistillerPageFactory
> distiller_page_factory(
135 new DistillerPageWebContentsFactory(context
));
136 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory(
137 new DistillerURLFetcherFactory(context
->GetRequestContext()));
139 dom_distiller::proto::DomDistillerOptions options
;
140 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly
)) {
141 options
.set_extract_text_only(true);
144 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel
) &&
146 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
149 options
.set_debug_level(debug_level
);
151 scoped_ptr
<DistillerFactory
> distiller_factory(
152 new TestDistillerFactoryImpl(distiller_url_fetcher_factory
.Pass(),
156 // Setting up PrefService for DistilledPagePrefs.
157 user_prefs::TestingPrefServiceSyncable
* pref_service
=
158 new user_prefs::TestingPrefServiceSyncable();
159 DistilledPagePrefs::RegisterProfilePrefs(pref_service
->registry());
161 return scoped_ptr
<DomDistillerService
>(new DomDistillerService(
162 dom_distiller_store
.Pass(),
163 distiller_factory
.Pass(),
164 distiller_page_factory
.Pass(),
165 scoped_ptr
<DistilledPagePrefs
>(new DistilledPagePrefs(pref_service
))));
168 void AddComponentsTestResources() {
169 base::FilePath pak_file
;
170 base::FilePath pak_dir
;
171 PathService::Get(base::DIR_MODULE
, &pak_dir
);
173 pak_dir
.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
174 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
175 pak_file
, ui::SCALE_FACTOR_NONE
);
178 bool WriteProtobufWithSize(
179 const google::protobuf::MessageLite
& message
,
180 google::protobuf::io::ZeroCopyOutputStream
* output_stream
) {
181 google::protobuf::io::CodedOutputStream
coded_output(output_stream
);
184 const int size
= message
.ByteSize();
185 coded_output
.WriteLittleEndian32(size
);
186 message
.SerializeWithCachedSizes(&coded_output
);
187 return !coded_output
.HadError();
190 std::string
GetReadableArticleString(
191 const DistilledArticleProto
& article_proto
) {
192 std::stringstream output
;
193 output
<< "Article Title: " << article_proto
.title() << std::endl
;
194 output
<< "# of pages: " << article_proto
.pages_size() << std::endl
;
195 for (int i
= 0; i
< article_proto
.pages_size(); ++i
) {
196 if (i
> 0) output
<< std::endl
;
197 const DistilledPageProto
& page
= article_proto
.pages(i
);
198 output
<< "Page " << i
<< std::endl
;
199 output
<< "URL: " << page
.url() << std::endl
;
200 output
<< "Content: " << page
.html() << std::endl
;
201 if (page
.has_debug_info() && page
.debug_info().has_log())
202 output
<< "Log: " << page
.debug_info().log() << std::endl
;
203 if (page
.has_pagination_info()) {
204 if (page
.pagination_info().has_next_page()) {
205 output
<< "Next Page: " << page
.pagination_info().next_page()
208 if (page
.pagination_info().has_prev_page()) {
209 output
<< "Prev Page: " << page
.pagination_info().prev_page()
219 class ContentExtractionRequest
: public ViewRequestDelegate
{
221 void Start(DomDistillerService
* service
, const gfx::Size
& render_view_size
,
222 base::Closure finished_callback
) {
223 finished_callback_
= finished_callback
;
225 service
->ViewUrl(this,
226 service
->CreateDefaultDistillerPage(render_view_size
),
230 DistilledArticleProto
GetArticleCopy() {
231 return *article_proto_
;
234 static ScopedVector
<ContentExtractionRequest
> CreateForCommandLine(
235 const base::CommandLine
& command_line
,
236 FileToUrlMap
* file_to_url_map
) {
237 ScopedVector
<ContentExtractionRequest
> requests
;
238 if (command_line
.HasSwitch(kUrlSwitch
)) {
240 std::string url_string
= command_line
.GetSwitchValueASCII(kUrlSwitch
);
241 url
= GURL(url_string
);
242 if (url
.is_valid()) {
243 requests
.push_back(new ContentExtractionRequest(url
));
244 if (command_line
.HasSwitch(kOriginalUrl
)) {
245 (*file_to_url_map
)[url
.spec()] =
246 command_line
.GetSwitchValueASCII(kOriginalUrl
);
249 } else if (command_line
.HasSwitch(kUrlsSwitch
)) {
250 std::string urls_string
= command_line
.GetSwitchValueASCII(kUrlsSwitch
);
251 std::vector
<std::string
> urls
= base::SplitString(
252 urls_string
, " ", base::TRIM_WHITESPACE
, base::SPLIT_WANT_ALL
);
// Check for original-urls switch, which must exactly pair up with
// |kUrlsSwitch| i.e. number of original urls must be same as that of
// urls.
256 std::vector
<std::string
> original_urls
;
257 if (command_line
.HasSwitch(kOriginalUrls
)) {
258 std::string original_urls_string
=
259 command_line
.GetSwitchValueASCII(kOriginalUrls
);
260 original_urls
= base::SplitString(
261 original_urls_string
, " ",
262 base::TRIM_WHITESPACE
, base::SPLIT_WANT_ALL
);
263 if (original_urls
.size() != urls
.size())
264 original_urls
.clear();
266 for (size_t i
= 0; i
< urls
.size(); ++i
) {
268 if (url
.is_valid()) {
269 requests
.push_back(new ContentExtractionRequest(url
));
270 // Only regard non-empty original urls.
271 if (!original_urls
.empty() && !original_urls
[i
].empty()) {
272 (*file_to_url_map
)[url
.spec()] = original_urls
[i
];
275 ADD_FAILURE() << "Bad url";
279 if (requests
.empty()) {
280 ADD_FAILURE() << "No valid url provided";
283 return requests
.Pass();
287 ContentExtractionRequest(const GURL
& url
) : url_(url
) {}
289 void OnArticleUpdated(ArticleDistillationUpdate article_update
) override
{}
291 void OnArticleReady(const DistilledArticleProto
* article_proto
) override
{
292 article_proto_
= article_proto
;
293 CHECK(article_proto
->pages_size()) << "Failed extracting " << url_
;
294 base::ThreadTaskRunnerHandle::Get()->PostTask(FROM_HERE
,
298 const DistilledArticleProto
* article_proto_
;
299 scoped_ptr
<ViewerHandle
> viewer_handle_
;
301 base::Closure finished_callback_
;
304 class ContentExtractor
: public ContentBrowserTest
{
308 max_tasks_(kMaxExtractorTasks
),
311 protobuf_output_stream_(
312 new google::protobuf::io::StringOutputStream(&output_data_
)) {}
314 // Change behavior of the default host resolver to avoid DNS lookup errors, so
315 // we can make network calls.
316 void SetUpOnMainThread() override
{
317 if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch
)) {
318 EnableDNSLookupForThisTest();
320 CHECK(db_dir_
.CreateUniqueTempDir());
321 AddComponentsTestResources();
324 void TearDownOnMainThread() override
{ DisableDNSLookupForThisTest(); }
// Creates the DomDistillerService and creates and starts the extraction
// request.
330 const base::CommandLine
& command_line
=
331 *base::CommandLine::ForCurrentProcess();
332 FileToUrlMap file_to_url_map
;
333 requests_
= ContentExtractionRequest::CreateForCommandLine(
334 command_line
, &file_to_url_map
);
335 content::BrowserContext
* context
=
336 shell()->web_contents()->GetBrowserContext();
337 service_
= CreateDomDistillerService(context
,
344 while (pending_tasks_
< max_tasks_
&& next_request_
< requests_
.size()) {
345 requests_
[next_request_
]->Start(
347 shell()->web_contents()->GetContainerBounds().size(),
348 base::Bind(&ContentExtractor::FinishRequest
, base::Unretained(this)));
355 // Change behavior of the default host resolver to allow DNS lookup
356 // to proceed instead of being blocked by the test infrastructure.
357 void EnableDNSLookupForThisTest() {
358 // mock_host_resolver_override_ takes ownership of the resolver.
359 scoped_refptr
<net::RuleBasedHostResolverProc
> resolver
=
360 new net::RuleBasedHostResolverProc(host_resolver());
361 resolver
->AllowDirectLookup("*");
362 mock_host_resolver_override_
.reset(
363 new net::ScopedDefaultHostResolverProc(resolver
.get()));
366 // We need to reset the DNS lookup when we finish, or the test will fail.
367 void DisableDNSLookupForThisTest() {
368 mock_host_resolver_override_
.reset();
371 void FinishRequest() {
373 if (next_request_
== requests_
.size() && pending_tasks_
== 0) {
380 void DoArticleOutput() {
381 const base::CommandLine
& command_line
=
382 *base::CommandLine::ForCurrentProcess();
383 for (size_t i
= 0; i
< requests_
.size(); ++i
) {
384 const DistilledArticleProto
& article
= requests_
[i
]->GetArticleCopy();
385 if (command_line
.HasSwitch(kShouldOutputBinary
)) {
386 WriteProtobufWithSize(article
, protobuf_output_stream_
.get());
388 output_data_
+= GetReadableArticleString(article
) + "\n";
392 if (command_line
.HasSwitch(kOutputFile
)) {
393 base::FilePath filename
= command_line
.GetSwitchValuePath(kOutputFile
);
395 (int)output_data_
.size(),
396 base::WriteFile(filename
, output_data_
.c_str(), output_data_
.size()));
398 VLOG(0) << output_data_
;
406 base::ThreadTaskRunnerHandle::Get()->PostTask(
407 FROM_HERE
, base::MessageLoop::QuitWhenIdleClosure());
410 size_t pending_tasks_
;
412 size_t next_request_
;
414 base::ScopedTempDir db_dir_
;
415 scoped_ptr
<net::ScopedDefaultHostResolverProc
> mock_host_resolver_override_
;
416 scoped_ptr
<DomDistillerService
> service_
;
417 ScopedVector
<ContentExtractionRequest
> requests_
;
419 std::string output_data_
;
420 scoped_ptr
<google::protobuf::io::StringOutputStream
> protobuf_output_stream_
;
423 IN_PROC_BROWSER_TEST_F(ContentExtractor
, MANUAL_ExtractUrl
) {
424 SetDistillerJavaScriptWorldId(content::ISOLATED_WORLD_ID_CONTENT_END
);
426 base::RunLoop().Run();
429 } // namespace dom_distiller