1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/message_loop/message_loop.h"
10 #include "base/path_service.h"
11 #include "base/run_loop.h"
12 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_split.h"
14 #include "components/dom_distiller/content/distiller_page_web_contents.h"
15 #include "components/dom_distiller/core/article_entry.h"
16 #include "components/dom_distiller/core/distilled_page_prefs.h"
17 #include "components/dom_distiller/core/distiller.h"
18 #include "components/dom_distiller/core/dom_distiller_service.h"
19 #include "components/dom_distiller/core/dom_distiller_store.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "components/dom_distiller/core/task_tracker.h"
23 #include "components/leveldb_proto/proto_database.h"
24 #include "components/leveldb_proto/proto_database_impl.h"
25 #include "components/pref_registry/testing_pref_service_syncable.h"
26 #include "content/public/browser/browser_context.h"
27 #include "content/public/browser/browser_thread.h"
28 #include "content/public/test/content_browser_test.h"
29 #include "content/shell/browser/shell.h"
30 #include "google/protobuf/io/coded_stream.h"
31 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
32 #include "net/dns/mock_host_resolver.h"
33 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
34 #include "ui/base/resource/resource_bundle.h"
36 using content::ContentBrowserTest
;
38 namespace dom_distiller
{
// Command-line switch names understood by this test. They are declared as
// const arrays rather than `const char*` so that each name is a true constant
// with internal linkage and cannot be re-pointed at run time.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of logging it.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Debug level to request from the distiller; the switch value is forwarded to
// the distiller options when it is present.
const char kDebugLevel[] = "debug-level";

// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
67 scoped_ptr
<DomDistillerService
> CreateDomDistillerService(
68 content::BrowserContext
* context
,
69 const base::FilePath
& db_path
) {
70 scoped_refptr
<base::SequencedTaskRunner
> background_task_runner
=
71 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
72 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
74 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
75 // temporary directory.
76 scoped_ptr
<leveldb_proto::ProtoDatabaseImpl
<ArticleEntry
> > db(
77 new leveldb_proto::ProtoDatabaseImpl
<ArticleEntry
>(
78 background_task_runner
));
79 scoped_ptr
<DomDistillerStore
> dom_distiller_store(new DomDistillerStore(
80 db
.PassAs
<leveldb_proto::ProtoDatabase
<ArticleEntry
> >(), db_path
));
82 scoped_ptr
<DistillerPageFactory
> distiller_page_factory(
83 new DistillerPageWebContentsFactory(context
));
84 scoped_ptr
<DistillerURLFetcherFactory
> distiller_url_fetcher_factory(
85 new DistillerURLFetcherFactory(context
->GetRequestContext()));
87 dom_distiller::proto::DomDistillerOptions options
;
88 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly
)) {
89 options
.set_extract_text_only(true);
92 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel
) &&
94 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
97 options
.set_debug_level(debug_level
);
99 scoped_ptr
<DistillerFactory
> distiller_factory(
100 new DistillerFactoryImpl(distiller_url_fetcher_factory
.Pass(), options
));
102 // Setting up PrefService for DistilledPagePrefs.
103 user_prefs::TestingPrefServiceSyncable
* pref_service
=
104 new user_prefs::TestingPrefServiceSyncable();
105 DistilledPagePrefs::RegisterProfilePrefs(pref_service
->registry());
107 return scoped_ptr
<DomDistillerService
>(new DomDistillerService(
108 dom_distiller_store
.PassAs
<DomDistillerStoreInterface
>(),
109 distiller_factory
.Pass(),
110 distiller_page_factory
.Pass(),
111 scoped_ptr
<DistilledPagePrefs
>(
112 new DistilledPagePrefs(pref_service
))));
115 void AddComponentsResources() {
116 base::FilePath pak_file
;
117 base::FilePath pak_dir
;
118 PathService::Get(base::DIR_MODULE
, &pak_dir
);
119 pak_file
= pak_dir
.Append(FILE_PATH_LITERAL("components_resources.pak"));
120 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
121 pak_file
, ui::SCALE_FACTOR_NONE
);
124 bool WriteProtobufWithSize(
125 const google::protobuf::MessageLite
& message
,
126 google::protobuf::io::ZeroCopyOutputStream
* output_stream
) {
127 google::protobuf::io::CodedOutputStream
coded_output(output_stream
);
130 const int size
= message
.ByteSize();
131 coded_output
.WriteLittleEndian32(size
);
132 message
.SerializeWithCachedSizes(&coded_output
);
133 return !coded_output
.HadError();
136 std::string
GetReadableArticleString(
137 const DistilledArticleProto
& article_proto
) {
138 std::stringstream output
;
139 output
<< "Article Title: " << article_proto
.title() << std::endl
;
140 output
<< "# of pages: " << article_proto
.pages_size() << std::endl
;
141 for (int i
= 0; i
< article_proto
.pages_size(); ++i
) {
142 const DistilledPageProto
& page
= article_proto
.pages(i
);
143 output
<< "Page " << i
<< std::endl
;
144 output
<< "URL: " << page
.url() << std::endl
;
145 output
<< "Content: " << page
.html() << std::endl
;
152 class ContentExtractionRequest
: public ViewRequestDelegate
{
154 void Start(DomDistillerService
* service
, const gfx::Size
& render_view_size
,
155 base::Closure finished_callback
) {
156 finished_callback_
= finished_callback
;
158 service
->ViewUrl(this,
159 service
->CreateDefaultDistillerPage(render_view_size
),
163 DistilledArticleProto
GetArticleCopy() {
164 return *article_proto_
;
167 static ScopedVector
<ContentExtractionRequest
> CreateForCommandLine(
168 const CommandLine
& command_line
) {
169 ScopedVector
<ContentExtractionRequest
> requests
;
170 if (command_line
.HasSwitch(kUrlSwitch
)) {
172 std::string url_string
= command_line
.GetSwitchValueASCII(kUrlSwitch
);
173 url
= GURL(url_string
);
174 if (url
.is_valid()) {
175 requests
.push_back(new ContentExtractionRequest(url
));
177 } else if (command_line
.HasSwitch(kUrlsSwitch
)) {
178 std::string urls_string
= command_line
.GetSwitchValueASCII(kUrlsSwitch
);
179 std::vector
<std::string
> urls
;
180 base::SplitString(urls_string
, ' ', &urls
);
181 for (size_t i
= 0; i
< urls
.size(); ++i
) {
183 if (url
.is_valid()) {
184 requests
.push_back(new ContentExtractionRequest(url
));
186 ADD_FAILURE() << "Bad url";
190 if (requests
.empty()) {
191 ADD_FAILURE() << "No valid url provided";
194 return requests
.Pass();
198 ContentExtractionRequest(const GURL
& url
) : url_(url
) {}
200 virtual void OnArticleUpdated(ArticleDistillationUpdate article_update
)
203 virtual void OnArticleReady(const DistilledArticleProto
* article_proto
)
205 article_proto_
= article_proto
;
206 base::MessageLoop::current()->PostTask(
211 const DistilledArticleProto
* article_proto_
;
212 scoped_ptr
<ViewerHandle
> viewer_handle_
;
214 base::Closure finished_callback_
;
217 class ContentExtractor
: public ContentBrowserTest
{
221 max_tasks_(kMaxExtractorTasks
),
224 protobuf_output_stream_(
225 new google::protobuf::io::StringOutputStream(&output_data_
)) {}
227 // Change behavior of the default host resolver to avoid DNS lookup errors, so
228 // we can make network calls.
229 virtual void SetUpOnMainThread() OVERRIDE
{
230 if (!CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch
)) {
231 EnableDNSLookupForThisTest();
233 CHECK(db_dir_
.CreateUniqueTempDir());
234 AddComponentsResources();
237 virtual void TearDownOnMainThread() OVERRIDE
{
238 DisableDNSLookupForThisTest();
242 // Creates the DomDistillerService and creates and starts the extraction
245 content::BrowserContext
* context
=
246 shell()->web_contents()->GetBrowserContext();
247 service_
= CreateDomDistillerService(context
,
249 const CommandLine
& command_line
= *CommandLine::ForCurrentProcess();
250 requests_
= ContentExtractionRequest::CreateForCommandLine(command_line
);
255 while (pending_tasks_
< max_tasks_
&& next_request_
< requests_
.size()) {
256 requests_
[next_request_
]->Start(
258 shell()->web_contents()->GetContainerBounds().size(),
259 base::Bind(&ContentExtractor::FinishRequest
, base::Unretained(this)));
266 // Change behavior of the default host resolver to allow DNS lookup
267 // to proceed instead of being blocked by the test infrastructure.
268 void EnableDNSLookupForThisTest() {
269 // mock_host_resolver_override_ takes ownership of the resolver.
270 scoped_refptr
<net::RuleBasedHostResolverProc
> resolver
=
271 new net::RuleBasedHostResolverProc(host_resolver());
272 resolver
->AllowDirectLookup("*");
273 mock_host_resolver_override_
.reset(
274 new net::ScopedDefaultHostResolverProc(resolver
.get()));
277 // We need to reset the DNS lookup when we finish, or the test will fail.
278 void DisableDNSLookupForThisTest() {
279 mock_host_resolver_override_
.reset();
282 void FinishRequest() {
284 if (next_request_
== requests_
.size() && pending_tasks_
== 0) {
291 void DoArticleOutput() {
292 for (size_t i
= 0; i
< requests_
.size(); ++i
) {
293 const DistilledArticleProto
& article
= requests_
[i
]->GetArticleCopy();
294 if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary
)) {
295 WriteProtobufWithSize(article
, protobuf_output_stream_
.get());
297 output_data_
+= GetReadableArticleString(article
) + "\n";
301 if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile
)) {
302 base::FilePath filename
=
303 CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile
);
305 (int)output_data_
.size(),
306 base::WriteFile(filename
, output_data_
.c_str(), output_data_
.size()));
308 VLOG(0) << output_data_
;
316 base::MessageLoop::current()->PostTask(
317 FROM_HERE
, base::MessageLoop::QuitWhenIdleClosure());
320 size_t pending_tasks_
;
322 size_t next_request_
;
324 base::ScopedTempDir db_dir_
;
325 scoped_ptr
<net::ScopedDefaultHostResolverProc
> mock_host_resolver_override_
;
326 scoped_ptr
<DomDistillerService
> service_
;
327 ScopedVector
<ContentExtractionRequest
> requests_
;
329 std::string output_data_
;
330 scoped_ptr
<google::protobuf::io::StringOutputStream
> protobuf_output_stream_
;
333 IN_PROC_BROWSER_TEST_F(ContentExtractor
, MANUAL_ExtractUrl
) {
335 base::RunLoop().Run();
338 } // namespace dom_distiller