Disable firewall check. It takes significant time and needs to be on the FILE thread.
[chromium-blink-merge.git] / components / dom_distiller / standalone / content_extractor.cc
blob760b1658890370b6331ba432544121559d786828
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include <sstream>
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/message_loop/message_loop.h"
10 #include "base/path_service.h"
11 #include "base/run_loop.h"
12 #include "components/dom_distiller/content/distiller_page_web_contents.h"
13 #include "components/dom_distiller/core/distiller.h"
14 #include "components/dom_distiller/core/dom_distiller_database.h"
15 #include "components/dom_distiller/core/dom_distiller_service.h"
16 #include "components/dom_distiller/core/dom_distiller_store.h"
17 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
18 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
19 #include "components/dom_distiller/core/task_tracker.h"
20 #include "content/public/browser/browser_context.h"
21 #include "content/public/browser/browser_thread.h"
22 #include "content/public/test/content_browser_test.h"
23 #include "content/shell/browser/shell.h"
24 #include "net/dns/mock_host_resolver.h"
25 #include "ui/base/resource/resource_bundle.h"
27 using content::ContentBrowserTest;
29 namespace dom_distiller {
31 namespace {
// Names of the command-line switches read by this file. They are declared as
// constant arrays rather than `const char*` so that the names cannot be
// re-pointed at run time.

// The url to distill.
const char kUrlSwitch[] = "url";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of emitting it
// through VLOG(0).
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";
46 scoped_ptr<DomDistillerService> CreateDomDistillerService(
47 content::BrowserContext* context,
48 const base::FilePath& db_path) {
49 scoped_refptr<base::SequencedTaskRunner> background_task_runner =
50 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
51 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
53 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
54 // temporary directory.
55 scoped_ptr<DomDistillerDatabase> db(
56 new DomDistillerDatabase(background_task_runner));
57 scoped_ptr<DomDistillerStore> dom_distiller_store(new DomDistillerStore(
58 db.PassAs<DomDistillerDatabaseInterface>(), db_path));
60 scoped_ptr<DistillerPageFactory> distiller_page_factory(
61 new DistillerPageWebContentsFactory(context));
62 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
63 new DistillerURLFetcherFactory(context->GetRequestContext()));
64 scoped_ptr<DistillerFactory> distiller_factory(
65 new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass()));
67 return scoped_ptr<DomDistillerService>(new DomDistillerService(
68 dom_distiller_store.PassAs<DomDistillerStoreInterface>(),
69 distiller_factory.Pass(),
70 distiller_page_factory.Pass()));
73 void AddComponentsResources() {
74 base::FilePath pak_file;
75 base::FilePath pak_dir;
76 PathService::Get(base::DIR_MODULE, &pak_dir);
77 pak_file = pak_dir.Append(FILE_PATH_LITERAL("components_resources.pak"));
78 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
79 pak_file, ui::SCALE_FACTOR_NONE);
82 void LogArticle(const DistilledArticleProto& article_proto) {
83 std::stringstream output;
84 if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
85 output << article_proto.SerializeAsString();
86 } else {
87 output << "Article Title: " << article_proto.title() << std::endl;
88 output << "# of pages: " << article_proto.pages_size() << std::endl;
89 for (int i = 0; i < article_proto.pages_size(); ++i) {
90 const DistilledPageProto& page = article_proto.pages(i);
91 output << "Page " << i << std::endl;
92 output << "URL: " << page.url() << std::endl;
93 output << "Content: " << page.html() << std::endl;
97 std::string data = output.str();
98 if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
99 base::FilePath filename =
100 CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
101 base::WriteFile(filename, data.c_str(), data.size());
102 } else {
103 VLOG(0) << data;
107 } // namespace
109 class ContentExtractionRequest : public ViewRequestDelegate {
110 public:
111 void Start(DomDistillerService* service, base::Closure finished_callback) {
112 finished_callback_ = finished_callback;
113 viewer_handle_ =
114 service->ViewUrl(this, service->CreateDefaultDistillerPage(), url_);
117 DistilledArticleProto GetArticleCopy() {
118 return *article_proto_;
121 static scoped_ptr<ContentExtractionRequest> CreateForCommandLine(
122 const CommandLine& command_line) {
123 GURL url;
124 if (command_line.HasSwitch(kUrlSwitch)) {
125 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
126 url = GURL(url_string);
128 if (!url.is_valid()) {
129 ADD_FAILURE() << "No valid url provided";
130 return scoped_ptr<ContentExtractionRequest>();
132 return scoped_ptr<ContentExtractionRequest>(
133 new ContentExtractionRequest(url));
136 private:
137 ContentExtractionRequest(const GURL& url) : url_(url) {}
139 virtual void OnArticleUpdated(ArticleDistillationUpdate article_update)
140 OVERRIDE {}
142 virtual void OnArticleReady(const DistilledArticleProto* article_proto)
143 OVERRIDE {
144 article_proto_ = article_proto;
145 base::MessageLoop::current()->PostTask(
146 FROM_HERE,
147 finished_callback_);
150 const DistilledArticleProto* article_proto_;
151 scoped_ptr<ViewerHandle> viewer_handle_;
152 GURL url_;
153 base::Closure finished_callback_;
156 class ContentExtractor : public ContentBrowserTest {
157 // Change behavior of the default host resolver to avoid DNS lookup errors, so
158 // we can make network calls.
159 virtual void SetUpOnMainThread() OVERRIDE {
160 if (!CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
161 EnableDNSLookupForThisTest();
163 CHECK(db_dir_.CreateUniqueTempDir());
164 AddComponentsResources();
167 virtual void TearDownOnMainThread() OVERRIDE {
168 DisableDNSLookupForThisTest();
171 protected:
172 // Creates the DomDistillerService and creates and starts the extraction
173 // request.
174 void Start() {
175 content::BrowserContext* context =
176 shell()->web_contents()->GetBrowserContext();
177 service_ = CreateDomDistillerService(context,
178 db_dir_.path());
179 const CommandLine& command_line = *CommandLine::ForCurrentProcess();
180 request_ = ContentExtractionRequest::CreateForCommandLine(command_line);
181 request_->Start(
182 service_.get(),
183 base::Bind(&ContentExtractor::Finish, base::Unretained(this)));
186 private:
187 // Change behavior of the default host resolver to allow DNS lookup
188 // to proceed instead of being blocked by the test infrastructure.
189 void EnableDNSLookupForThisTest() {
190 // mock_host_resolver_override_ takes ownership of the resolver.
191 scoped_refptr<net::RuleBasedHostResolverProc> resolver =
192 new net::RuleBasedHostResolverProc(host_resolver());
193 resolver->AllowDirectLookup("*");
194 mock_host_resolver_override_.reset(
195 new net::ScopedDefaultHostResolverProc(resolver.get()));
198 // We need to reset the DNS lookup when we finish, or the test will fail.
199 void DisableDNSLookupForThisTest() {
200 mock_host_resolver_override_.reset();
203 void Finish() {
204 LogArticle(request_->GetArticleCopy());
205 request_.reset();
206 service_.reset();
207 base::MessageLoop::current()->PostTask(
208 FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
211 base::ScopedTempDir db_dir_;
212 scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
213 scoped_ptr<DomDistillerService> service_;
214 scoped_ptr<ContentExtractionRequest> request_;
217 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
218 Start();
219 base::RunLoop().Run();
222 } // namespace dom_distiller