Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / content / renderer / dom_serializer_browsertest.cc
blob5f531002d56f80ddccd0cc2a22a519b46710eca5
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/bind.h"
6 #include "base/command_line.h"
7 #include "base/compiler_specific.h"
8 #include "base/containers/hash_tables.h"
9 #include "base/files/file_path.h"
10 #include "base/files/file_util.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "content/public/browser/render_view_host.h"
14 #include "content/public/browser/web_contents.h"
15 #include "content/public/common/content_switches.h"
16 #include "content/public/renderer/render_view.h"
17 #include "content/public/renderer/render_view_observer.h"
18 #include "content/public/test/content_browser_test.h"
19 #include "content/public/test/content_browser_test_utils.h"
20 #include "content/public/test/test_utils.h"
21 #include "content/renderer/savable_resources.h"
22 #include "content/shell/browser/shell.h"
23 #include "net/base/filename_util.h"
24 #include "net/url_request/url_request_context.h"
25 #include "third_party/WebKit/public/platform/WebCString.h"
26 #include "third_party/WebKit/public/platform/WebData.h"
27 #include "third_party/WebKit/public/platform/WebString.h"
28 #include "third_party/WebKit/public/platform/WebURL.h"
29 #include "third_party/WebKit/public/platform/WebVector.h"
30 #include "third_party/WebKit/public/web/WebDocument.h"
31 #include "third_party/WebKit/public/web/WebDocumentType.h"
32 #include "third_party/WebKit/public/web/WebElement.h"
33 #include "third_party/WebKit/public/web/WebElementCollection.h"
34 #include "third_party/WebKit/public/web/WebLocalFrame.h"
35 #include "third_party/WebKit/public/web/WebNode.h"
36 #include "third_party/WebKit/public/web/WebNodeList.h"
37 #include "third_party/WebKit/public/web/WebPageSerializer.h"
38 #include "third_party/WebKit/public/web/WebPageSerializerClient.h"
39 #include "third_party/WebKit/public/web/WebView.h"
41 using blink::WebCString;
42 using blink::WebData;
43 using blink::WebDocument;
44 using blink::WebElement;
45 using blink::WebElementCollection;
46 using blink::WebFrame;
47 using blink::WebLocalFrame;
48 using blink::WebNode;
49 using blink::WebNodeList;
50 using blink::WebPageSerializer;
51 using blink::WebPageSerializerClient;
52 using blink::WebString;
53 using blink::WebURL;
54 using blink::WebView;
55 using blink::WebVector;
57 namespace content {
59 // Iterate recursively over sub-frames to find one with with a given url.
60 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) {
61 if (!web_view->mainFrame())
62 return NULL;
64 std::vector<WebFrame*> stack;
65 stack.push_back(web_view->mainFrame());
67 while (!stack.empty()) {
68 WebFrame* current_frame = stack.back();
69 stack.pop_back();
70 if (GURL(current_frame->document().url()) == url)
71 return current_frame;
72 WebElementCollection all = current_frame->document().all();
73 for (WebElement element = all.firstItem();
74 !element.isNull(); element = all.nextItem()) {
75 // Check frame tag and iframe tag
76 if (!element.hasHTMLTagName("frame") && !element.hasHTMLTagName("iframe"))
77 continue;
78 WebFrame* sub_frame = WebLocalFrame::fromFrameOwnerElement(element);
79 if (sub_frame)
80 stack.push_back(sub_frame);
83 return NULL;
86 bool HasDocType(const WebDocument& doc) {
87 return !doc.doctype().isNull();
90 // Helper function for checking whether input node is META tag. Return true
91 // means it is META element, otherwise return false. The parameter charset_info
92 // return actual charset info if the META tag has charset declaration.
93 bool IsMetaElement(const WebNode& node, std::string& charset_info) {
94 if (!node.isElementNode())
95 return false;
96 const WebElement meta = node.toConst<WebElement>();
97 if (!meta.hasHTMLTagName("meta"))
98 return false;
99 charset_info.erase(0, charset_info.length());
100 // Check the META charset declaration.
101 WebString httpEquiv = meta.getAttribute("http-equiv");
102 if (base::LowerCaseEqualsASCII(base::StringPiece16(httpEquiv),
103 "content-type")) {
104 std::string content = meta.getAttribute("content").utf8();
105 int pos = content.find("charset", 0);
106 if (pos > -1) {
107 // Add a dummy charset declaration to charset_info, which indicates this
108 // META tag has charset declaration although we do not get correct value
109 // yet.
110 charset_info.append("has-charset-declaration");
111 int remaining_length = content.length() - pos - 7;
112 if (!remaining_length)
113 return true;
114 int start_pos = pos + 7;
115 // Find "=" symbol.
116 while (remaining_length--)
117 if (content[start_pos++] == L'=')
118 break;
119 // Skip beginning space.
120 while (remaining_length) {
121 if (content[start_pos] > 0x0020)
122 break;
123 ++start_pos;
124 --remaining_length;
126 if (!remaining_length)
127 return true;
128 int end_pos = start_pos;
129 // Now we find out the start point of charset info. Search the end point.
130 while (remaining_length--) {
131 if (content[end_pos] <= 0x0020 || content[end_pos] == L';')
132 break;
133 ++end_pos;
135 // Get actual charset info.
136 charset_info = content.substr(start_pos, end_pos - start_pos);
137 return true;
140 return true;
143 class LoadObserver : public RenderViewObserver {
144 public:
145 LoadObserver(RenderView* render_view, const base::Closure& quit_closure)
146 : RenderViewObserver(render_view),
147 quit_closure_(quit_closure) {}
149 void DidFinishLoad(blink::WebLocalFrame* frame) override {
150 if (frame == render_view()->GetWebView()->mainFrame())
151 quit_closure_.Run();
154 private:
155 base::Closure quit_closure_;
158 class DomSerializerTests : public ContentBrowserTest,
159 public WebPageSerializerClient {
160 public:
161 DomSerializerTests()
162 : serialized_(false),
163 local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) {}
165 void SetUpCommandLine(base::CommandLine* command_line) override {
166 command_line->AppendSwitch(switches::kSingleProcess);
167 #if defined(OS_WIN)
168 // Don't want to try to create a GPU process.
169 command_line->AppendSwitch(switches::kDisableGpu);
170 #endif
173 void SetUpOnMainThread() override {
174 render_view_routing_id_ =
175 shell()->web_contents()->GetRenderViewHost()->GetRoutingID();
178 // DomSerializerDelegate.
179 virtual void didSerializeDataForFrame(const WebURL& frame_web_url,
180 const WebCString& data,
181 PageSerializationStatus status) {
183 GURL frame_url(frame_web_url);
184 // If the all frames are finished saving, check all finish status
185 if (status == WebPageSerializerClient::AllFramesAreFinished) {
186 SerializationFinishStatusMap::iterator it =
187 serialization_finish_status_.begin();
188 for (; it != serialization_finish_status_.end(); ++it)
189 ASSERT_TRUE(it->second);
190 serialized_ = true;
191 return;
194 // Check finish status of current frame.
195 SerializationFinishStatusMap::iterator it =
196 serialization_finish_status_.find(frame_url.spec());
197 // New frame, set initial status as false.
198 if (it == serialization_finish_status_.end())
199 serialization_finish_status_[frame_url.spec()] = false;
201 it = serialization_finish_status_.find(frame_url.spec());
202 ASSERT_TRUE(it != serialization_finish_status_.end());
203 // In process frame, finish status should be false.
204 ASSERT_FALSE(it->second);
206 // Add data to corresponding frame's content.
207 serialized_frame_map_[frame_url.spec()] += data.data();
209 // Current frame is completed saving, change the finish status.
210 if (status == WebPageSerializerClient::CurrentFrameIsFinished)
211 it->second = true;
214 bool HasSerializedFrame(const GURL& frame_url) {
215 return serialized_frame_map_.find(frame_url.spec()) !=
216 serialized_frame_map_.end();
219 const std::string& GetSerializedContentForFrame(
220 const GURL& frame_url) {
221 return serialized_frame_map_[frame_url.spec()];
224 RenderView* GetRenderView() {
225 return RenderView::FromRoutingID(render_view_routing_id_);
228 WebView* GetWebView() {
229 return GetRenderView()->GetWebView();
232 WebFrame* GetMainFrame() {
233 return GetWebView()->mainFrame();
236 // Load web page according to input content and relative URLs within
237 // the document.
238 void LoadContents(const std::string& contents,
239 const GURL& base_url,
240 const WebString encoding_info) {
241 scoped_refptr<MessageLoopRunner> runner = new MessageLoopRunner;
242 LoadObserver observer(GetRenderView(), runner->QuitClosure());
244 // If input encoding is empty, use UTF-8 as default encoding.
245 if (encoding_info.isEmpty()) {
246 GetMainFrame()->loadHTMLString(contents, base_url);
247 } else {
248 WebData data(contents.data(), contents.length());
250 // Do not use WebFrame.LoadHTMLString because it assumes that input
251 // html contents use UTF-8 encoding.
252 // TODO(darin): This should use WebFrame::loadData.
253 WebFrame* web_frame = GetMainFrame();
255 ASSERT_TRUE(web_frame != NULL);
257 web_frame->loadData(data, "text/html", encoding_info, base_url);
260 runner->Run();
263 // Serialize page DOM according to specific page URL. The parameter
264 // recursive_serialization indicates whether we will serialize all
265 // sub-frames.
266 void SerializeDomForURL(const GURL& page_url,
267 bool recursive_serialization) {
268 // Find corresponding WebFrame according to page_url.
269 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), page_url);
270 ASSERT_TRUE(web_frame != NULL);
271 WebVector<WebURL> links;
272 links.assign(&page_url, 1);
273 WebString file_path =
274 base::FilePath(FILE_PATH_LITERAL("c:\\dummy.htm")).AsUTF16Unsafe();
275 WebVector<WebString> local_paths;
276 local_paths.assign(&file_path, 1);
277 // Start serializing DOM.
278 bool result = WebPageSerializer::serialize(web_frame->toWebLocalFrame(),
279 recursive_serialization,
280 static_cast<WebPageSerializerClient*>(this),
281 links,
282 local_paths,
283 local_directory_name_.AsUTF16Unsafe());
284 ASSERT_TRUE(result);
285 ASSERT_TRUE(serialized_);
288 void SerializeHTMLDOMWithDocTypeOnRenderer(const GURL& file_url) {
289 // Make sure original contents have document type.
290 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
291 ASSERT_TRUE(web_frame != NULL);
292 WebDocument doc = web_frame->document();
293 ASSERT_TRUE(HasDocType(doc));
294 // Do serialization.
295 SerializeDomForURL(file_url, false);
296 // Load the serialized contents.
297 ASSERT_TRUE(HasSerializedFrame(file_url));
298 const std::string& serialized_contents =
299 GetSerializedContentForFrame(file_url);
300 LoadContents(serialized_contents, file_url,
301 web_frame->document().encoding());
302 // Make sure serialized contents still have document type.
303 web_frame = GetMainFrame();
304 doc = web_frame->document();
305 ASSERT_TRUE(HasDocType(doc));
308 void SerializeHTMLDOMWithoutDocTypeOnRenderer(const GURL& file_url) {
309 // Make sure original contents do not have document type.
310 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
311 ASSERT_TRUE(web_frame != NULL);
312 WebDocument doc = web_frame->document();
313 ASSERT_TRUE(!HasDocType(doc));
314 // Do serialization.
315 SerializeDomForURL(file_url, false);
316 // Load the serialized contents.
317 ASSERT_TRUE(HasSerializedFrame(file_url));
318 const std::string& serialized_contents =
319 GetSerializedContentForFrame(file_url);
320 LoadContents(serialized_contents, file_url,
321 web_frame->document().encoding());
322 // Make sure serialized contents do not have document type.
323 web_frame = GetMainFrame();
324 doc = web_frame->document();
325 ASSERT_TRUE(!HasDocType(doc));
328 void SerializeXMLDocWithBuiltInEntitiesOnRenderer(
329 const GURL& xml_file_url, const std::string& original_contents) {
330 // Do serialization.
331 SerializeDomForURL(xml_file_url, false);
332 // Compare the serialized contents with original contents.
333 ASSERT_TRUE(HasSerializedFrame(xml_file_url));
334 const std::string& serialized_contents =
335 GetSerializedContentForFrame(xml_file_url);
336 ASSERT_EQ(original_contents, serialized_contents);
339 void SerializeHTMLDOMWithAddingMOTWOnRenderer(
340 const GURL& file_url, const std::string& original_contents) {
341 // Make sure original contents does not have MOTW;
342 std::string motw_declaration =
343 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
344 ASSERT_FALSE(motw_declaration.empty());
345 // The encoding of original contents is ISO-8859-1, so we convert the MOTW
346 // declaration to ASCII and search whether original contents has it or not.
347 ASSERT_TRUE(std::string::npos == original_contents.find(motw_declaration));
349 // Do serialization.
350 SerializeDomForURL(file_url, false);
351 // Make sure the serialized contents have MOTW ;
352 ASSERT_TRUE(HasSerializedFrame(file_url));
353 const std::string& serialized_contents =
354 GetSerializedContentForFrame(file_url);
355 ASSERT_FALSE(std::string::npos ==
356 serialized_contents.find(motw_declaration));
359 void SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer(
360 const GURL& file_url) {
361 // Make sure there is no META charset declaration in original document.
362 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
363 ASSERT_TRUE(web_frame != NULL);
364 WebDocument doc = web_frame->document();
365 ASSERT_TRUE(doc.isHTMLDocument());
366 WebElement head_element = doc.head();
367 ASSERT_TRUE(!head_element.isNull());
368 // Go through all children of HEAD element.
369 for (WebNode child = head_element.firstChild(); !child.isNull();
370 child = child.nextSibling()) {
371 std::string charset_info;
372 if (IsMetaElement(child, charset_info))
373 ASSERT_TRUE(charset_info.empty());
375 // Do serialization.
376 SerializeDomForURL(file_url, false);
378 // Load the serialized contents.
379 ASSERT_TRUE(HasSerializedFrame(file_url));
380 const std::string& serialized_contents =
381 GetSerializedContentForFrame(file_url);
382 LoadContents(serialized_contents, file_url,
383 web_frame->document().encoding());
384 // Make sure the first child of HEAD element is META which has charset
385 // declaration in serialized contents.
386 web_frame = GetMainFrame();
387 ASSERT_TRUE(web_frame != NULL);
388 doc = web_frame->document();
389 ASSERT_TRUE(doc.isHTMLDocument());
390 head_element = doc.head();
391 ASSERT_TRUE(!head_element.isNull());
392 WebNode meta_node = head_element.firstChild();
393 ASSERT_TRUE(!meta_node.isNull());
394 // Get meta charset info.
395 std::string charset_info2;
396 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
397 ASSERT_TRUE(!charset_info2.empty());
398 ASSERT_EQ(charset_info2,
399 std::string(web_frame->document().encoding().utf8()));
401 // Make sure no more additional META tags which have charset declaration.
402 for (WebNode child = meta_node.nextSibling(); !child.isNull();
403 child = child.nextSibling()) {
404 std::string charset_info;
405 if (IsMetaElement(child, charset_info))
406 ASSERT_TRUE(charset_info.empty());
410 void SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer(
411 const GURL& file_url) {
412 // Make sure there are multiple META charset declarations in original
413 // document.
414 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
415 ASSERT_TRUE(web_frame != NULL);
416 WebDocument doc = web_frame->document();
417 ASSERT_TRUE(doc.isHTMLDocument());
418 WebElement head_ele = doc.head();
419 ASSERT_TRUE(!head_ele.isNull());
420 // Go through all children of HEAD element.
421 int charset_declaration_count = 0;
422 for (WebNode child = head_ele.firstChild(); !child.isNull();
423 child = child.nextSibling()) {
424 std::string charset_info;
425 if (IsMetaElement(child, charset_info) && !charset_info.empty())
426 charset_declaration_count++;
428 // The original doc has more than META tags which have charset declaration.
429 ASSERT_TRUE(charset_declaration_count > 1);
431 // Do serialization.
432 SerializeDomForURL(file_url, false);
434 // Load the serialized contents.
435 ASSERT_TRUE(HasSerializedFrame(file_url));
436 const std::string& serialized_contents =
437 GetSerializedContentForFrame(file_url);
438 LoadContents(serialized_contents, file_url,
439 web_frame->document().encoding());
440 // Make sure only first child of HEAD element is META which has charset
441 // declaration in serialized contents.
442 web_frame = GetMainFrame();
443 ASSERT_TRUE(web_frame != NULL);
444 doc = web_frame->document();
445 ASSERT_TRUE(doc.isHTMLDocument());
446 head_ele = doc.head();
447 ASSERT_TRUE(!head_ele.isNull());
448 WebNode meta_node = head_ele.firstChild();
449 ASSERT_TRUE(!meta_node.isNull());
450 // Get meta charset info.
451 std::string charset_info2;
452 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
453 ASSERT_TRUE(!charset_info2.empty());
454 ASSERT_EQ(charset_info2,
455 std::string(web_frame->document().encoding().utf8()));
457 // Make sure no more additional META tags which have charset declaration.
458 for (WebNode child = meta_node.nextSibling(); !child.isNull();
459 child = child.nextSibling()) {
460 std::string charset_info;
461 if (IsMetaElement(child, charset_info))
462 ASSERT_TRUE(charset_info.empty());
466 void SerializeHTMLDOMWithEntitiesInTextOnRenderer() {
467 base::FilePath page_file_path = GetTestFilePath(
468 "dom_serializer", "dom_serializer/htmlentities_in_text.htm");
469 // Get file URL. The URL is dummy URL to identify the following loading
470 // actions. The test content is in constant:original_contents.
471 GURL file_url = net::FilePathToFileURL(page_file_path);
472 ASSERT_TRUE(file_url.SchemeIsFile());
473 // Test contents.
474 static const char* const original_contents =
475 "<html><body>&amp;&lt;&gt;\"\'</body></html>";
476 // Load the test contents.
477 LoadContents(original_contents, file_url, WebString());
479 // Get BODY's text content in DOM.
480 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
481 ASSERT_TRUE(web_frame != NULL);
482 WebDocument doc = web_frame->document();
483 ASSERT_TRUE(doc.isHTMLDocument());
484 WebElement body_ele = doc.body();
485 ASSERT_TRUE(!body_ele.isNull());
486 WebNode text_node = body_ele.firstChild();
487 ASSERT_TRUE(text_node.isTextNode());
488 ASSERT_TRUE(std::string(text_node.nodeValue().utf8()) == "&<>\"\'");
489 // Do serialization.
490 SerializeDomForURL(file_url, false);
491 // Compare the serialized contents with original contents.
492 ASSERT_TRUE(HasSerializedFrame(file_url));
493 const std::string& serialized_contents =
494 GetSerializedContentForFrame(file_url);
495 // Compare the serialized contents with original contents to make sure
496 // they are same.
497 // Because we add MOTW when serializing DOM, so before comparison, we also
498 // need to add MOTW to original_contents.
499 std::string original_str =
500 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
501 original_str += original_contents;
502 // Since WebCore now inserts a new HEAD element if there is no HEAD element
503 // when creating BODY element. (Please see
504 // HTMLParser::bodyCreateErrorCheck.) We need to append the HEAD content and
505 // corresponding META content if we find WebCore-generated HEAD element.
506 if (!doc.head().isNull()) {
507 WebString encoding = web_frame->document().encoding();
508 std::string htmlTag("<html>");
509 std::string::size_type pos = original_str.find(htmlTag);
510 ASSERT_NE(std::string::npos, pos);
511 pos += htmlTag.length();
512 std::string head_part("<head>");
513 head_part +=
514 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
515 head_part += "</head>";
516 original_str.insert(pos, head_part);
518 ASSERT_EQ(original_str, serialized_contents);
521 void SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer() {
522 base::FilePath page_file_path = GetTestFilePath(
523 "dom_serializer", "dom_serializer/htmlentities_in_attribute_value.htm");
524 // Get file URL. The URL is dummy URL to identify the following loading
525 // actions. The test content is in constant:original_contents.
526 GURL file_url = net::FilePathToFileURL(page_file_path);
527 ASSERT_TRUE(file_url.SchemeIsFile());
528 // Test contents.
529 static const char* const original_contents =
530 "<html><body title=\"&amp;&lt;&gt;&quot;&#39;\"></body></html>";
531 // Load the test contents.
532 LoadContents(original_contents, file_url, WebString());
533 // Get value of BODY's title attribute in DOM.
534 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
535 ASSERT_TRUE(web_frame != NULL);
536 WebDocument doc = web_frame->document();
537 ASSERT_TRUE(doc.isHTMLDocument());
538 WebElement body_ele = doc.body();
539 ASSERT_TRUE(!body_ele.isNull());
540 WebString value = body_ele.getAttribute("title");
541 ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'");
542 // Do serialization.
543 SerializeDomForURL(file_url, false);
544 // Compare the serialized contents with original contents.
545 ASSERT_TRUE(HasSerializedFrame(file_url));
546 const std::string& serialized_contents =
547 GetSerializedContentForFrame(file_url);
548 // Compare the serialized contents with original contents to make sure
549 // they are same.
550 std::string original_str =
551 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
552 original_str += original_contents;
553 if (!doc.isNull()) {
554 WebString encoding = web_frame->document().encoding();
555 std::string htmlTag("<html>");
556 std::string::size_type pos = original_str.find(htmlTag);
557 ASSERT_NE(std::string::npos, pos);
558 pos += htmlTag.length();
559 std::string head_part("<head>");
560 head_part +=
561 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
562 head_part += "</head>";
563 original_str.insert(pos, head_part);
565 ASSERT_EQ(original_str, serialized_contents);
568 void SerializeHTMLDOMWithNonStandardEntitiesOnRenderer(const GURL& file_url) {
569 // Get value of BODY's title attribute in DOM.
570 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
571 WebDocument doc = web_frame->document();
572 ASSERT_TRUE(doc.isHTMLDocument());
573 WebElement body_element = doc.body();
574 // Unescaped string for "&percnt;&nsup;&sup1;&apos;".
575 static const wchar_t parsed_value[] = {
576 '%', 0x2285, 0x00b9, '\'', 0
578 WebString value = body_element.getAttribute("title");
579 WebString content = doc.contentAsTextForTesting();
580 ASSERT_TRUE(base::UTF16ToWide(value) == parsed_value);
581 ASSERT_TRUE(base::UTF16ToWide(content) == parsed_value);
583 // Do serialization.
584 SerializeDomForURL(file_url, false);
585 // Check the serialized string.
586 ASSERT_TRUE(HasSerializedFrame(file_url));
587 const std::string& serialized_contents =
588 GetSerializedContentForFrame(file_url);
589 // Confirm that the serialized string has no non-standard HTML entities.
590 ASSERT_EQ(std::string::npos, serialized_contents.find("&percnt;"));
591 ASSERT_EQ(std::string::npos, serialized_contents.find("&nsup;"));
592 ASSERT_EQ(std::string::npos, serialized_contents.find("&sup1;"));
593 ASSERT_EQ(std::string::npos, serialized_contents.find("&apos;"));
596 void SerializeHTMLDOMWithBaseTagOnRenderer(const GURL& file_url,
597 const GURL& path_dir_url) {
598 // There are total 2 available base tags in this test file.
599 const int kTotalBaseTagCountInTestFile = 2;
601 // Since for this test, we assume there is no savable sub-resource links for
602 // this test file, also all links are relative URLs in this test file, so we
603 // need to check those relative URLs and make sure document has BASE tag.
604 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
605 ASSERT_TRUE(web_frame != NULL);
606 WebDocument doc = web_frame->document();
607 ASSERT_TRUE(doc.isHTMLDocument());
608 // Go through all descent nodes.
609 WebElementCollection all = doc.all();
610 int original_base_tag_count = 0;
611 for (WebElement element = all.firstItem(); !element.isNull();
612 element = all.nextItem()) {
613 if (element.hasHTMLTagName("base")) {
614 original_base_tag_count++;
615 } else {
616 // Get link.
617 WebString value = GetSubResourceLinkFromElement(element);
618 if (value.isNull() && element.hasHTMLTagName("a")) {
619 value = element.getAttribute("href");
620 if (value.isEmpty())
621 value = WebString();
623 // Each link is relative link.
624 if (!value.isNull()) {
625 GURL link(value.utf8());
626 ASSERT_TRUE(link.scheme().empty());
630 ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile);
631 // Make sure in original document, the base URL is not equal with the
632 // |path_dir_url|.
633 GURL original_base_url(doc.baseURL());
634 ASSERT_NE(original_base_url, path_dir_url);
636 // Do serialization.
637 SerializeDomForURL(file_url, false);
639 // Load the serialized contents.
640 ASSERT_TRUE(HasSerializedFrame(file_url));
641 const std::string& serialized_contents =
642 GetSerializedContentForFrame(file_url);
643 LoadContents(serialized_contents, file_url,
644 web_frame->document().encoding());
646 // Make sure all links are absolute URLs and doc there are some number of
647 // BASE tags in serialized HTML data. Each of those BASE tags have same base
648 // URL which is as same as URL of current test file.
649 web_frame = GetMainFrame();
650 ASSERT_TRUE(web_frame != NULL);
651 doc = web_frame->document();
652 ASSERT_TRUE(doc.isHTMLDocument());
653 // Go through all descent nodes.
654 all = doc.all();
655 int new_base_tag_count = 0;
656 for (WebNode node = all.firstItem(); !node.isNull();
657 node = all.nextItem()) {
658 if (!node.isElementNode())
659 continue;
660 WebElement element = node.to<WebElement>();
661 if (element.hasHTMLTagName("base")) {
662 new_base_tag_count++;
663 } else {
664 // Get link.
665 WebString value = GetSubResourceLinkFromElement(element);
666 if (value.isNull() && element.hasHTMLTagName("a")) {
667 value = element.getAttribute("href");
668 if (value.isEmpty())
669 value = WebString();
671 // Each link is absolute link.
672 if (!value.isNull()) {
673 GURL link(std::string(value.utf8()));
674 ASSERT_FALSE(link.scheme().empty());
678 // We have one more added BASE tag which is generated by JavaScript.
679 ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1);
680 // Make sure in new document, the base URL is equal with the |path_dir_url|.
681 GURL new_base_url(doc.baseURL());
682 ASSERT_EQ(new_base_url, path_dir_url);
685 void SerializeHTMLDOMWithEmptyHeadOnRenderer() {
686 base::FilePath page_file_path = GetTestFilePath(
687 "dom_serializer", "empty_head.htm");
688 GURL file_url = net::FilePathToFileURL(page_file_path);
689 ASSERT_TRUE(file_url.SchemeIsFile());
691 // Load the test html content.
692 static const char* const empty_head_contents =
693 "<html><head></head><body>hello world</body></html>";
694 LoadContents(empty_head_contents, file_url, WebString());
696 // Make sure the head tag is empty.
697 WebFrame* web_frame = GetMainFrame();
698 ASSERT_TRUE(web_frame != NULL);
699 WebDocument doc = web_frame->document();
700 ASSERT_TRUE(doc.isHTMLDocument());
701 WebElement head_element = doc.head();
702 ASSERT_TRUE(!head_element.isNull());
703 ASSERT_TRUE(!head_element.hasChildNodes());
704 ASSERT_TRUE(head_element.childNodes().length() == 0);
706 // Do serialization.
707 SerializeDomForURL(file_url, false);
708 // Make sure the serialized contents have META ;
709 ASSERT_TRUE(HasSerializedFrame(file_url));
710 const std::string& serialized_contents =
711 GetSerializedContentForFrame(file_url);
713 // Reload serialized contents and make sure there is only one META tag.
714 LoadContents(serialized_contents, file_url,
715 web_frame->document().encoding());
716 web_frame = GetMainFrame();
717 ASSERT_TRUE(web_frame != NULL);
718 doc = web_frame->document();
719 ASSERT_TRUE(doc.isHTMLDocument());
720 head_element = doc.head();
721 ASSERT_TRUE(!head_element.isNull());
722 ASSERT_TRUE(head_element.hasChildNodes());
723 ASSERT_TRUE(head_element.childNodes().length() == 1);
724 WebNode meta_node = head_element.firstChild();
725 ASSERT_TRUE(!meta_node.isNull());
726 // Get meta charset info.
727 std::string charset_info;
728 ASSERT_TRUE(IsMetaElement(meta_node, charset_info));
729 ASSERT_TRUE(!charset_info.empty());
730 ASSERT_EQ(charset_info,
731 std::string(web_frame->document().encoding().utf8()));
733 // Check the body's first node is text node and its contents are
734 // "hello world"
735 WebElement body_element = doc.body();
736 ASSERT_TRUE(!body_element.isNull());
737 WebNode text_node = body_element.firstChild();
738 ASSERT_TRUE(text_node.isTextNode());
739 WebString text_node_contents = text_node.nodeValue();
740 ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world");
743 void SerializeDocumentWithDownloadedIFrameOnRenderer(const GURL& file_url) {
744 // Do a recursive serialization. We pass if we don't crash.
745 SerializeDomForURL(file_url, true);
748 void SubResourceForElementsInNonHTMLNamespaceOnRenderer(
749 const GURL& file_url) {
750 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
751 ASSERT_TRUE(web_frame != NULL);
752 WebDocument doc = web_frame->document();
753 WebNode lastNodeInBody = doc.body().lastChild();
754 ASSERT_TRUE(lastNodeInBody.isElementNode());
755 WebString uri = GetSubResourceLinkFromElement(
756 lastNodeInBody.to<WebElement>());
757 EXPECT_TRUE(uri.isNull());
760 private:
761 int32 render_view_routing_id_;
762 // Map frame_url to corresponding serialized_content.
763 typedef base::hash_map<std::string, std::string> SerializedFrameContentMap;
764 SerializedFrameContentMap serialized_frame_map_;
765 // Map frame_url to corresponding status of serialization finish.
766 typedef base::hash_map<std::string, bool> SerializationFinishStatusMap;
767 SerializationFinishStatusMap serialization_finish_status_;
768 // Flag indicates whether the process of serializing DOM is finished or not.
769 bool serialized_;
770 // The local_directory_name_ is dummy relative path of directory which
771 // contain all saved auxiliary files included all sub frames and resources.
772 const base::FilePath local_directory_name_;
775 // If original contents have document type, the serialized contents also have
776 // document type.
777 // Disabled by ellyjones@ on 2015-05-18, see https://crbug.com/488495.
778 #if defined(OS_MACOSX)
779 #define MAYBE_SerializeHTMLDOMWithDocType DISABLED_SerializeHTMLDOMWithDocType
780 #else
781 #define MAYBE_SerializeHTMLDOMWithDocType SerializeHTMLDOMWithDocType
782 #endif
784 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
785 MAYBE_SerializeHTMLDOMWithDocType) {
786 base::FilePath page_file_path =
787 GetTestFilePath("dom_serializer", "youtube_1.htm");
788 GURL file_url = net::FilePathToFileURL(page_file_path);
789 ASSERT_TRUE(file_url.SchemeIsFile());
790 // Load the test file.
791 NavigateToURL(shell(), file_url);
793 PostTaskToInProcessRendererAndWait(
794 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithDocTypeOnRenderer,
795 base::Unretained(this), file_url));
798 // If original contents do not have document type, the serialized contents
799 // also do not have document type.
800 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) {
801 base::FilePath page_file_path =
802 GetTestFilePath("dom_serializer", "youtube_2.htm");
803 GURL file_url = net::FilePathToFileURL(page_file_path);
804 ASSERT_TRUE(file_url.SchemeIsFile());
805 // Load the test file.
806 NavigateToURL(shell(), file_url);
808 PostTaskToInProcessRendererAndWait(
809 base::Bind(
810 &DomSerializerTests::SerializeHTMLDOMWithoutDocTypeOnRenderer,
811 base::Unretained(this), file_url));
814 // Serialize XML document which has all 5 built-in entities. After
815 // finishing serialization, the serialized contents should be same
816 // with original XML document.
818 // TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
819 // XML headers are handled differently in the merged serializer.
820 // Bug: http://crbug.com/328354
821 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
822 DISABLED_SerializeXMLDocWithBuiltInEntities) {
823 base::FilePath page_file_path =
824 GetTestFilePath("dom_serializer", "note.html");
825 base::FilePath xml_file_path = GetTestFilePath("dom_serializer", "note.xml");
826 // Read original contents for later comparison.
827 std::string original_contents;
828 ASSERT_TRUE(base::ReadFileToString(xml_file_path, &original_contents));
829 // Get file URL.
830 GURL file_url = net::FilePathToFileURL(page_file_path);
831 GURL xml_file_url = net::FilePathToFileURL(xml_file_path);
832 ASSERT_TRUE(file_url.SchemeIsFile());
833 // Load the test file.
834 NavigateToURL(shell(), file_url);
836 PostTaskToInProcessRendererAndWait(
837 base::Bind(
838 &DomSerializerTests::SerializeXMLDocWithBuiltInEntitiesOnRenderer,
839 base::Unretained(this), xml_file_url, original_contents));
842 // When serializing DOM, we add MOTW declaration before html tag.
843 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) {
844 base::FilePath page_file_path =
845 GetTestFilePath("dom_serializer", "youtube_2.htm");
846 // Read original contents for later comparison .
847 std::string original_contents;
848 ASSERT_TRUE(base::ReadFileToString(page_file_path, &original_contents));
849 // Get file URL.
850 GURL file_url = net::FilePathToFileURL(page_file_path);
851 ASSERT_TRUE(file_url.SchemeIsFile());
853 // Load the test file.
854 NavigateToURL(shell(), file_url);
856 PostTaskToInProcessRendererAndWait(
857 base::Bind(
858 &DomSerializerTests::SerializeHTMLDOMWithAddingMOTWOnRenderer,
859 base::Unretained(this), file_url, original_contents));
862 // When serializing DOM, we will add the META which have correct charset
863 // declaration as first child of HEAD element for resolving WebKit bug:
864 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
865 // does not have META charset declaration.
866 // Disabled by battre@ on 2015-05-21, see https://crbug.com/488495.
867 #if defined(OS_MACOSX)
868 #define MAYBE_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc \
869 DISABLED_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc
870 #else
871 #define MAYBE_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc \
872 SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc
873 #endif
874 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
875 MAYBE_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) {
876 base::FilePath page_file_path =
877 GetTestFilePath("dom_serializer", "youtube_1.htm");
878 // Get file URL.
879 GURL file_url = net::FilePathToFileURL(page_file_path);
880 ASSERT_TRUE(file_url.SchemeIsFile());
881 // Load the test file.
882 NavigateToURL(shell(), file_url);
884 PostTaskToInProcessRendererAndWait(
885 base::Bind(
886 &DomSerializerTests::
887 SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer,
888 base::Unretained(this), file_url));
891 // When serializing DOM, if the original document has multiple META charset
892 // declaration, we will add the META which have correct charset declaration
893 // as first child of HEAD element and remove all original META charset
894 // declarations.
895 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
896 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) {
897 base::FilePath page_file_path =
898 GetTestFilePath("dom_serializer", "youtube_2.htm");
899 // Get file URL.
900 GURL file_url = net::FilePathToFileURL(page_file_path);
901 ASSERT_TRUE(file_url.SchemeIsFile());
902 // Load the test file.
903 NavigateToURL(shell(), file_url);
905 PostTaskToInProcessRendererAndWait(
906 base::Bind(
907 &DomSerializerTests::
908 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer,
909 base::Unretained(this), file_url));
912 // Test situation of html entities in text when serializing HTML DOM.
913 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) {
914 // Need to spin up the renderer and also navigate to a file url so that the
915 // renderer code doesn't attempt a fork when it sees a load to file scheme
916 // from non-file scheme.
917 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
919 PostTaskToInProcessRendererAndWait(
920 base::Bind(
921 &DomSerializerTests::SerializeHTMLDOMWithEntitiesInTextOnRenderer,
922 base::Unretained(this)));
925 // Test situation of html entities in attribute value when serializing
926 // HTML DOM.
927 // This test started to fail at WebKit r65388. See http://crbug.com/52279.
929 // TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
930 // Some attributes are handled differently in the merged serializer.
931 // Bug: http://crbug.com/328354
932 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
933 DISABLED_SerializeHTMLDOMWithEntitiesInAttributeValue) {
934 // Need to spin up the renderer and also navigate to a file url so that the
935 // renderer code doesn't attempt a fork when it sees a load to file scheme
936 // from non-file scheme.
937 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
939 PostTaskToInProcessRendererAndWait(
940 base::Bind(
941 &DomSerializerTests::
942 SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer,
943 base::Unretained(this)));
946 // Test situation of non-standard HTML entities when serializing HTML DOM.
947 // This test started to fail at WebKit r65351. See http://crbug.com/52279.
948 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
949 SerializeHTMLDOMWithNonStandardEntities) {
950 // Make a test file URL and load it.
951 base::FilePath page_file_path = GetTestFilePath(
952 "dom_serializer", "nonstandard_htmlentities.htm");
953 GURL file_url = net::FilePathToFileURL(page_file_path);
954 NavigateToURL(shell(), file_url);
956 PostTaskToInProcessRendererAndWait(
957 base::Bind(
958 &DomSerializerTests::
959 SerializeHTMLDOMWithNonStandardEntitiesOnRenderer,
960 base::Unretained(this), file_url));
963 // Test situation of BASE tag in original document when serializing HTML DOM.
964 // When serializing, we should comment the BASE tag, append a new BASE tag.
965 // rewrite all the savable URLs to relative local path, and change other URLs
966 // to absolute URLs.
968 // TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
969 // Base tags are handled a bit different in merged version.
970 // Bug: http://crbug.com/328354
971 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
972 DISABLED_SerializeHTMLDOMWithBaseTag) {
973 base::FilePath page_file_path = GetTestFilePath(
974 "dom_serializer", "html_doc_has_base_tag.htm");
976 // Get page dir URL which is base URL of this file.
977 base::FilePath dir_name = page_file_path.DirName();
978 dir_name = dir_name.Append(
979 base::FilePath::StringType(base::FilePath::kSeparators[0], 1));
980 GURL path_dir_url = net::FilePathToFileURL(dir_name);
982 // Get file URL.
983 GURL file_url = net::FilePathToFileURL(page_file_path);
984 ASSERT_TRUE(file_url.SchemeIsFile());
985 // Load the test file.
986 NavigateToURL(shell(), file_url);
988 PostTaskToInProcessRendererAndWait(
989 base::Bind(
990 &DomSerializerTests::SerializeHTMLDOMWithBaseTagOnRenderer,
991 base::Unretained(this), file_url, path_dir_url));
994 // Serializing page which has an empty HEAD tag.
995 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) {
996 // Need to spin up the renderer and also navigate to a file url so that the
997 // renderer code doesn't attempt a fork when it sees a load to file scheme
998 // from non-file scheme.
999 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
1001 PostTaskToInProcessRendererAndWait(
1002 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithEmptyHeadOnRenderer,
1003 base::Unretained(this)));
1006 // Test that we don't crash when the page contains an iframe that
1007 // was handled as a download (http://crbug.com/42212).
1008 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
1009 SerializeDocumentWithDownloadedIFrame) {
1010 base::FilePath page_file_path = GetTestFilePath(
1011 "dom_serializer", "iframe-src-is-exe.htm");
1012 GURL file_url = net::FilePathToFileURL(page_file_path);
1013 ASSERT_TRUE(file_url.SchemeIsFile());
1014 // Load the test file.
1015 NavigateToURL(shell(), file_url);
1017 PostTaskToInProcessRendererAndWait(
1018 base::Bind(
1019 &DomSerializerTests::
1020 SerializeDocumentWithDownloadedIFrameOnRenderer,
1021 base::Unretained(this), file_url));
1024 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
1025 SubResourceForElementsInNonHTMLNamespace) {
1026 base::FilePath page_file_path = GetTestFilePath(
1027 "dom_serializer", "non_html_namespace.htm");
1028 GURL file_url = net::FilePathToFileURL(page_file_path);
1029 NavigateToURL(shell(), file_url);
1031 PostTaskToInProcessRendererAndWait(
1032 base::Bind(
1033 &DomSerializerTests::
1034 SubResourceForElementsInNonHTMLNamespaceOnRenderer,
1035 base::Unretained(this), file_url));
1038 } // namespace content