Roll src/third_party/WebKit eac3800:0237a66 (svn 202606:202607)
[chromium-blink-merge.git] / components / dom_distiller / standalone / content_extractor_browsertest.cc
blob de20adee08e6bcc0737d9f4a6d7522cd06ed8a6e
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include <sstream>
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/id_map.h"
10 #include "base/location.h"
11 #include "base/path_service.h"
12 #include "base/run_loop.h"
13 #include "base/single_thread_task_runner.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "base/strings/string_split.h"
16 #include "base/thread_task_runner_handle.h"
17 #include "components/dom_distiller/content/browser/distiller_javascript_utils.h"
18 #include "components/dom_distiller/content/browser/distiller_page_web_contents.h"
19 #include "components/dom_distiller/core/article_entry.h"
20 #include "components/dom_distiller/core/distilled_page_prefs.h"
21 #include "components/dom_distiller/core/distiller.h"
22 #include "components/dom_distiller/core/dom_distiller_service.h"
23 #include "components/dom_distiller/core/dom_distiller_store.h"
24 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
25 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
26 #include "components/dom_distiller/core/task_tracker.h"
27 #include "components/leveldb_proto/proto_database.h"
28 #include "components/leveldb_proto/proto_database_impl.h"
29 #include "components/pref_registry/testing_pref_service_syncable.h"
30 #include "content/public/browser/browser_context.h"
31 #include "content/public/browser/browser_thread.h"
32 #include "content/public/common/isolated_world_ids.h"
33 #include "content/public/test/content_browser_test.h"
34 #include "content/shell/browser/shell.h"
35 #include "google/protobuf/io/coded_stream.h"
36 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
37 #include "net/dns/mock_host_resolver.h"
38 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
39 #include "ui/base/resource/resource_bundle.h"
41 using content::ContentBrowserTest;
43 namespace dom_distiller {
45 namespace {
47 typedef base::hash_map<std::string, std::string> FileToUrlMap;
51 // Factory for creating a Distiller that creates different DomDistillerOptions
52 // for different URLs, i.e. a specific kOriginalUrl option for each URL.
53 class TestDistillerFactoryImpl : public DistillerFactory {
54 public:
55 TestDistillerFactoryImpl(
56 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
57 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
58 const FileToUrlMap& file_to_url_map)
59 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
60 dom_distiller_options_(dom_distiller_options),
61 file_to_url_map_(file_to_url_map) {
64 ~TestDistillerFactoryImpl() override {}
66 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
67 dom_distiller::proto::DomDistillerOptions options;
68 options = dom_distiller_options_;
69 FileToUrlMap::const_iterator it = file_to_url_map_.find(url.spec());
70 if (it != file_to_url_map_.end()) {
71 options.set_original_url(it->second);
73 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
74 *distiller_url_fetcher_factory_, options));
75 return distiller.Pass();
78 private:
79 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
80 dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
81 FileToUrlMap file_to_url_map_;
84 namespace {
// Command-line switches understood by this test. They are declared as
// constant character arrays (not mutable pointers) so they cannot be
// re-pointed at run time.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of to stdout.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Indicates to include debug output.
const char kDebugLevel[] = "debug-level";

// The original URL of the page if |kUrlSwitch| is a file.
const char kOriginalUrl[] = "original-url";

// A space-separated list of original URLs corresponding to |kUrlsSwitch|.
// The list is only honored when it has exactly as many entries as
// |kUrlsSwitch|.
const char kOriginalUrls[] = "original-urls";

// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
118 scoped_ptr<DomDistillerService> CreateDomDistillerService(
119 content::BrowserContext* context,
120 const base::FilePath& db_path,
121 const FileToUrlMap& file_to_url_map) {
122 scoped_refptr<base::SequencedTaskRunner> background_task_runner =
123 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
124 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
126 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
127 // temporary directory.
128 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
129 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
130 background_task_runner));
131 scoped_ptr<DomDistillerStore> dom_distiller_store(
132 new DomDistillerStore(db.Pass(), db_path));
134 scoped_ptr<DistillerPageFactory> distiller_page_factory(
135 new DistillerPageWebContentsFactory(context));
136 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
137 new DistillerURLFetcherFactory(context->GetRequestContext()));
139 dom_distiller::proto::DomDistillerOptions options;
140 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
141 options.set_extract_text_only(true);
143 int debug_level = 0;
144 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
145 base::StringToInt(
146 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
147 kDebugLevel),
148 &debug_level)) {
149 options.set_debug_level(debug_level);
151 scoped_ptr<DistillerFactory> distiller_factory(
152 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(),
153 options,
154 file_to_url_map));
156 // Setting up PrefService for DistilledPagePrefs.
157 user_prefs::TestingPrefServiceSyncable* pref_service =
158 new user_prefs::TestingPrefServiceSyncable();
159 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
161 return scoped_ptr<DomDistillerService>(new DomDistillerService(
162 dom_distiller_store.Pass(),
163 distiller_factory.Pass(),
164 distiller_page_factory.Pass(),
165 scoped_ptr<DistilledPagePrefs>(new DistilledPagePrefs(pref_service))));
168 void AddComponentsTestResources() {
169 base::FilePath pak_file;
170 base::FilePath pak_dir;
171 PathService::Get(base::DIR_MODULE, &pak_dir);
172 pak_file =
173 pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
174 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
175 pak_file, ui::SCALE_FACTOR_NONE);
178 bool WriteProtobufWithSize(
179 const google::protobuf::MessageLite& message,
180 google::protobuf::io::ZeroCopyOutputStream* output_stream) {
181 google::protobuf::io::CodedOutputStream coded_output(output_stream);
183 // Write the size.
184 const int size = message.ByteSize();
185 coded_output.WriteLittleEndian32(size);
186 message.SerializeWithCachedSizes(&coded_output);
187 return !coded_output.HadError();
190 std::string GetReadableArticleString(
191 const DistilledArticleProto& article_proto) {
192 std::stringstream output;
193 output << "Article Title: " << article_proto.title() << std::endl;
194 output << "# of pages: " << article_proto.pages_size() << std::endl;
195 for (int i = 0; i < article_proto.pages_size(); ++i) {
196 if (i > 0) output << std::endl;
197 const DistilledPageProto& page = article_proto.pages(i);
198 output << "Page " << i << std::endl;
199 output << "URL: " << page.url() << std::endl;
200 output << "Content: " << page.html() << std::endl;
201 if (page.has_debug_info() && page.debug_info().has_log())
202 output << "Log: " << page.debug_info().log() << std::endl;
203 if (page.has_pagination_info()) {
204 if (page.pagination_info().has_next_page()) {
205 output << "Next Page: " << page.pagination_info().next_page()
206 << std::endl;
208 if (page.pagination_info().has_prev_page()) {
209 output << "Prev Page: " << page.pagination_info().prev_page()
210 << std::endl;
214 return output.str();
217 } // namespace
219 class ContentExtractionRequest : public ViewRequestDelegate {
220 public:
221 void Start(DomDistillerService* service, const gfx::Size& render_view_size,
222 base::Closure finished_callback) {
223 finished_callback_ = finished_callback;
224 viewer_handle_ =
225 service->ViewUrl(this,
226 service->CreateDefaultDistillerPage(render_view_size),
227 url_);
230 DistilledArticleProto GetArticleCopy() {
231 return *article_proto_;
234 static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
235 const base::CommandLine& command_line,
236 FileToUrlMap* file_to_url_map) {
237 ScopedVector<ContentExtractionRequest> requests;
238 if (command_line.HasSwitch(kUrlSwitch)) {
239 GURL url;
240 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
241 url = GURL(url_string);
242 if (url.is_valid()) {
243 requests.push_back(new ContentExtractionRequest(url));
244 if (command_line.HasSwitch(kOriginalUrl)) {
245 (*file_to_url_map)[url.spec()] =
246 command_line.GetSwitchValueASCII(kOriginalUrl);
249 } else if (command_line.HasSwitch(kUrlsSwitch)) {
250 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
251 std::vector<std::string> urls = base::SplitString(
252 urls_string, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
253 // Check for original-urls switch, which must exactly pair up with
254 // |kUrlsSwitch| i.e. number of original urls must be same as that of
255 // urls.
256 std::vector<std::string> original_urls;
257 if (command_line.HasSwitch(kOriginalUrls)) {
258 std::string original_urls_string =
259 command_line.GetSwitchValueASCII(kOriginalUrls);
260 original_urls = base::SplitString(
261 original_urls_string, " ",
262 base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
263 if (original_urls.size() != urls.size())
264 original_urls.clear();
266 for (size_t i = 0; i < urls.size(); ++i) {
267 GURL url(urls[i]);
268 if (url.is_valid()) {
269 requests.push_back(new ContentExtractionRequest(url));
270 // Only regard non-empty original urls.
271 if (!original_urls.empty() && !original_urls[i].empty()) {
272 (*file_to_url_map)[url.spec()] = original_urls[i];
274 } else {
275 ADD_FAILURE() << "Bad url";
279 if (requests.empty()) {
280 ADD_FAILURE() << "No valid url provided";
283 return requests.Pass();
286 private:
287 ContentExtractionRequest(const GURL& url) : url_(url) {}
289 void OnArticleUpdated(ArticleDistillationUpdate article_update) override {}
291 void OnArticleReady(const DistilledArticleProto* article_proto) override {
292 article_proto_ = article_proto;
293 CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
294 base::ThreadTaskRunnerHandle::Get()->PostTask(FROM_HERE,
295 finished_callback_);
298 const DistilledArticleProto* article_proto_;
299 scoped_ptr<ViewerHandle> viewer_handle_;
300 GURL url_;
301 base::Closure finished_callback_;
304 class ContentExtractor : public ContentBrowserTest {
305 public:
306 ContentExtractor()
307 : pending_tasks_(0),
308 max_tasks_(kMaxExtractorTasks),
309 next_request_(0),
310 output_data_(),
311 protobuf_output_stream_(
312 new google::protobuf::io::StringOutputStream(&output_data_)) {}
314 // Change behavior of the default host resolver to avoid DNS lookup errors, so
315 // we can make network calls.
316 void SetUpOnMainThread() override {
317 if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
318 EnableDNSLookupForThisTest();
320 CHECK(db_dir_.CreateUniqueTempDir());
321 AddComponentsTestResources();
324 void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); }
326 protected:
327 // Creates the DomDistillerService and creates and starts the extraction
328 // request.
329 void Start() {
330 const base::CommandLine& command_line =
331 *base::CommandLine::ForCurrentProcess();
332 FileToUrlMap file_to_url_map;
333 requests_ = ContentExtractionRequest::CreateForCommandLine(
334 command_line, &file_to_url_map);
335 content::BrowserContext* context =
336 shell()->web_contents()->GetBrowserContext();
337 service_ = CreateDomDistillerService(context,
338 db_dir_.path(),
339 file_to_url_map);
340 PumpQueue();
343 void PumpQueue() {
344 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
345 requests_[next_request_]->Start(
346 service_.get(),
347 shell()->web_contents()->GetContainerBounds().size(),
348 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
349 ++next_request_;
350 ++pending_tasks_;
354 private:
355 // Change behavior of the default host resolver to allow DNS lookup
356 // to proceed instead of being blocked by the test infrastructure.
357 void EnableDNSLookupForThisTest() {
358 // mock_host_resolver_override_ takes ownership of the resolver.
359 scoped_refptr<net::RuleBasedHostResolverProc> resolver =
360 new net::RuleBasedHostResolverProc(host_resolver());
361 resolver->AllowDirectLookup("*");
362 mock_host_resolver_override_.reset(
363 new net::ScopedDefaultHostResolverProc(resolver.get()));
366 // We need to reset the DNS lookup when we finish, or the test will fail.
367 void DisableDNSLookupForThisTest() {
368 mock_host_resolver_override_.reset();
371 void FinishRequest() {
372 --pending_tasks_;
373 if (next_request_ == requests_.size() && pending_tasks_ == 0) {
374 Finish();
375 } else {
376 PumpQueue();
380 void DoArticleOutput() {
381 const base::CommandLine& command_line =
382 *base::CommandLine::ForCurrentProcess();
383 for (size_t i = 0; i < requests_.size(); ++i) {
384 const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
385 if (command_line.HasSwitch(kShouldOutputBinary)) {
386 WriteProtobufWithSize(article, protobuf_output_stream_.get());
387 } else {
388 output_data_ += GetReadableArticleString(article) + "\n";
392 if (command_line.HasSwitch(kOutputFile)) {
393 base::FilePath filename = command_line.GetSwitchValuePath(kOutputFile);
394 ASSERT_EQ(
395 (int)output_data_.size(),
396 base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
397 } else {
398 VLOG(0) << output_data_;
402 void Finish() {
403 DoArticleOutput();
404 requests_.clear();
405 service_.reset();
406 base::ThreadTaskRunnerHandle::Get()->PostTask(
407 FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
410 size_t pending_tasks_;
411 size_t max_tasks_;
412 size_t next_request_;
414 base::ScopedTempDir db_dir_;
415 scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
416 scoped_ptr<DomDistillerService> service_;
417 ScopedVector<ContentExtractionRequest> requests_;
419 std::string output_data_;
420 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
423 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
424 SetDistillerJavaScriptWorldId(content::ISOLATED_WORLD_ID_CONTENT_END);
425 Start();
426 base::RunLoop().Run();
429 } // namespace dom_distiller