Re-submission of https://codereview.chromium.org/1041213003/
[chromium-blink-merge.git] / content / renderer / dom_serializer_browsertest.cc
blob b1759421b6c96b56c7f3312fbc19f8ec99c882cf
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/bind.h"
6 #include "base/command_line.h"
7 #include "base/compiler_specific.h"
8 #include "base/containers/hash_tables.h"
9 #include "base/files/file_path.h"
10 #include "base/files/file_util.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "content/public/common/content_switches.h"
14 #include "content/public/renderer/render_view.h"
15 #include "content/public/renderer/render_view_observer.h"
16 #include "content/public/test/content_browser_test.h"
17 #include "content/public/test/content_browser_test_utils.h"
18 #include "content/public/test/test_utils.h"
19 #include "content/renderer/savable_resources.h"
20 #include "content/shell/browser/shell.h"
21 #include "net/base/filename_util.h"
22 #include "net/url_request/url_request_context.h"
23 #include "third_party/WebKit/public/platform/WebCString.h"
24 #include "third_party/WebKit/public/platform/WebData.h"
25 #include "third_party/WebKit/public/platform/WebString.h"
26 #include "third_party/WebKit/public/platform/WebURL.h"
27 #include "third_party/WebKit/public/platform/WebVector.h"
28 #include "third_party/WebKit/public/web/WebDocument.h"
29 #include "third_party/WebKit/public/web/WebElement.h"
30 #include "third_party/WebKit/public/web/WebElementCollection.h"
31 #include "third_party/WebKit/public/web/WebLocalFrame.h"
32 #include "third_party/WebKit/public/web/WebNode.h"
33 #include "third_party/WebKit/public/web/WebNodeList.h"
34 #include "third_party/WebKit/public/web/WebPageSerializer.h"
35 #include "third_party/WebKit/public/web/WebPageSerializerClient.h"
36 #include "third_party/WebKit/public/web/WebView.h"
38 using blink::WebCString;
39 using blink::WebData;
40 using blink::WebDocument;
41 using blink::WebElement;
42 using blink::WebElementCollection;
43 using blink::WebFrame;
44 using blink::WebLocalFrame;
45 using blink::WebNode;
46 using blink::WebNodeList;
47 using blink::WebPageSerializer;
48 using blink::WebPageSerializerClient;
49 using blink::WebString;
50 using blink::WebURL;
51 using blink::WebView;
52 using blink::WebVector;
namespace {

// Routing ID of the first RenderView. The first RenderFrame is given routing
// ID 1, so the first RenderView ends up with 2.
const int kRenderViewRoutingId = 2;

}  // namespace
61 namespace content {
63 // Iterate recursively over sub-frames to find one with with a given url.
64 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) {
65 if (!web_view->mainFrame())
66 return NULL;
68 std::vector<WebFrame*> stack;
69 stack.push_back(web_view->mainFrame());
71 while (!stack.empty()) {
72 WebFrame* current_frame = stack.back();
73 stack.pop_back();
74 if (GURL(current_frame->document().url()) == url)
75 return current_frame;
76 WebElementCollection all = current_frame->document().all();
77 for (WebElement element = all.firstItem();
78 !element.isNull(); element = all.nextItem()) {
79 // Check frame tag and iframe tag
80 if (!element.hasHTMLTagName("frame") && !element.hasHTMLTagName("iframe"))
81 continue;
82 WebFrame* sub_frame = WebLocalFrame::fromFrameOwnerElement(element);
83 if (sub_frame)
84 stack.push_back(sub_frame);
87 return NULL;
90 // Helper function that test whether the first node in the doc is a doc type
91 // node.
92 bool HasDocType(const WebDocument& doc) {
93 WebNode node = doc.firstChild();
94 if (node.isNull())
95 return false;
96 return node.nodeType() == WebNode::DocumentTypeNode;
99 // Helper function for checking whether input node is META tag. Return true
100 // means it is META element, otherwise return false. The parameter charset_info
101 // return actual charset info if the META tag has charset declaration.
102 bool IsMetaElement(const WebNode& node, std::string& charset_info) {
103 if (!node.isElementNode())
104 return false;
105 const WebElement meta = node.toConst<WebElement>();
106 if (!meta.hasHTMLTagName("meta"))
107 return false;
108 charset_info.erase(0, charset_info.length());
109 // Check the META charset declaration.
110 WebString httpEquiv = meta.getAttribute("http-equiv");
111 if (LowerCaseEqualsASCII(httpEquiv, "content-type")) {
112 std::string content = meta.getAttribute("content").utf8();
113 int pos = content.find("charset", 0);
114 if (pos > -1) {
115 // Add a dummy charset declaration to charset_info, which indicates this
116 // META tag has charset declaration although we do not get correct value
117 // yet.
118 charset_info.append("has-charset-declaration");
119 int remaining_length = content.length() - pos - 7;
120 if (!remaining_length)
121 return true;
122 int start_pos = pos + 7;
123 // Find "=" symbol.
124 while (remaining_length--)
125 if (content[start_pos++] == L'=')
126 break;
127 // Skip beginning space.
128 while (remaining_length) {
129 if (content[start_pos] > 0x0020)
130 break;
131 ++start_pos;
132 --remaining_length;
134 if (!remaining_length)
135 return true;
136 int end_pos = start_pos;
137 // Now we find out the start point of charset info. Search the end point.
138 while (remaining_length--) {
139 if (content[end_pos] <= 0x0020 || content[end_pos] == L';')
140 break;
141 ++end_pos;
143 // Get actual charset info.
144 charset_info = content.substr(start_pos, end_pos - start_pos);
145 return true;
148 return true;
151 class LoadObserver : public RenderViewObserver {
152 public:
153 LoadObserver(RenderView* render_view, const base::Closure& quit_closure)
154 : RenderViewObserver(render_view),
155 quit_closure_(quit_closure) {}
157 void DidFinishLoad(blink::WebLocalFrame* frame) override {
158 if (frame == render_view()->GetWebView()->mainFrame())
159 quit_closure_.Run();
162 private:
163 base::Closure quit_closure_;
166 class DomSerializerTests : public ContentBrowserTest,
167 public WebPageSerializerClient {
168 public:
169 DomSerializerTests()
170 : serialized_(false),
171 local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) {}
173 void SetUpCommandLine(base::CommandLine* command_line) override {
174 command_line->AppendSwitch(switches::kSingleProcess);
175 #if defined(OS_WIN)
176 // Don't want to try to create a GPU process.
177 command_line->AppendSwitch(switches::kDisableGpu);
178 #endif
181 // DomSerializerDelegate.
182 virtual void didSerializeDataForFrame(const WebURL& frame_web_url,
183 const WebCString& data,
184 PageSerializationStatus status) {
186 GURL frame_url(frame_web_url);
187 // If the all frames are finished saving, check all finish status
188 if (status == WebPageSerializerClient::AllFramesAreFinished) {
189 SerializationFinishStatusMap::iterator it =
190 serialization_finish_status_.begin();
191 for (; it != serialization_finish_status_.end(); ++it)
192 ASSERT_TRUE(it->second);
193 serialized_ = true;
194 return;
197 // Check finish status of current frame.
198 SerializationFinishStatusMap::iterator it =
199 serialization_finish_status_.find(frame_url.spec());
200 // New frame, set initial status as false.
201 if (it == serialization_finish_status_.end())
202 serialization_finish_status_[frame_url.spec()] = false;
204 it = serialization_finish_status_.find(frame_url.spec());
205 ASSERT_TRUE(it != serialization_finish_status_.end());
206 // In process frame, finish status should be false.
207 ASSERT_FALSE(it->second);
209 // Add data to corresponding frame's content.
210 serialized_frame_map_[frame_url.spec()] += data.data();
212 // Current frame is completed saving, change the finish status.
213 if (status == WebPageSerializerClient::CurrentFrameIsFinished)
214 it->second = true;
217 bool HasSerializedFrame(const GURL& frame_url) {
218 return serialized_frame_map_.find(frame_url.spec()) !=
219 serialized_frame_map_.end();
222 const std::string& GetSerializedContentForFrame(
223 const GURL& frame_url) {
224 return serialized_frame_map_[frame_url.spec()];
227 RenderView* GetRenderView() {
228 // We could have the test on the UI thread get the WebContent's routing ID,
229 // but we know this will be the first RV so skip that and just hardcode it.
230 return RenderView::FromRoutingID(kRenderViewRoutingId);
233 WebView* GetWebView() {
234 return GetRenderView()->GetWebView();
237 WebFrame* GetMainFrame() {
238 return GetWebView()->mainFrame();
241 // Load web page according to input content and relative URLs within
242 // the document.
243 void LoadContents(const std::string& contents,
244 const GURL& base_url,
245 const WebString encoding_info) {
246 scoped_refptr<MessageLoopRunner> runner = new MessageLoopRunner;
247 LoadObserver observer(GetRenderView(), runner->QuitClosure());
249 // If input encoding is empty, use UTF-8 as default encoding.
250 if (encoding_info.isEmpty()) {
251 GetMainFrame()->loadHTMLString(contents, base_url);
252 } else {
253 WebData data(contents.data(), contents.length());
255 // Do not use WebFrame.LoadHTMLString because it assumes that input
256 // html contents use UTF-8 encoding.
257 // TODO(darin): This should use WebFrame::loadData.
258 WebFrame* web_frame = GetMainFrame();
260 ASSERT_TRUE(web_frame != NULL);
262 web_frame->loadData(data, "text/html", encoding_info, base_url);
265 runner->Run();
268 // Serialize page DOM according to specific page URL. The parameter
269 // recursive_serialization indicates whether we will serialize all
270 // sub-frames.
271 void SerializeDomForURL(const GURL& page_url,
272 bool recursive_serialization) {
273 // Find corresponding WebFrame according to page_url.
274 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), page_url);
275 ASSERT_TRUE(web_frame != NULL);
276 WebVector<WebURL> links;
277 links.assign(&page_url, 1);
278 WebString file_path =
279 base::FilePath(FILE_PATH_LITERAL("c:\\dummy.htm")).AsUTF16Unsafe();
280 WebVector<WebString> local_paths;
281 local_paths.assign(&file_path, 1);
282 // Start serializing DOM.
283 bool result = WebPageSerializer::serialize(web_frame->toWebLocalFrame(),
284 recursive_serialization,
285 static_cast<WebPageSerializerClient*>(this),
286 links,
287 local_paths,
288 local_directory_name_.AsUTF16Unsafe());
289 ASSERT_TRUE(result);
290 ASSERT_TRUE(serialized_);
293 void SerializeHTMLDOMWithDocTypeOnRenderer(const GURL& file_url) {
294 // Make sure original contents have document type.
295 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
296 ASSERT_TRUE(web_frame != NULL);
297 WebDocument doc = web_frame->document();
298 ASSERT_TRUE(HasDocType(doc));
299 // Do serialization.
300 SerializeDomForURL(file_url, false);
301 // Load the serialized contents.
302 ASSERT_TRUE(HasSerializedFrame(file_url));
303 const std::string& serialized_contents =
304 GetSerializedContentForFrame(file_url);
305 LoadContents(serialized_contents, file_url,
306 web_frame->document().encoding());
307 // Make sure serialized contents still have document type.
308 web_frame = GetMainFrame();
309 doc = web_frame->document();
310 ASSERT_TRUE(HasDocType(doc));
313 void SerializeHTMLDOMWithoutDocTypeOnRenderer(const GURL& file_url) {
314 // Make sure original contents do not have document type.
315 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
316 ASSERT_TRUE(web_frame != NULL);
317 WebDocument doc = web_frame->document();
318 ASSERT_TRUE(!HasDocType(doc));
319 // Do serialization.
320 SerializeDomForURL(file_url, false);
321 // Load the serialized contents.
322 ASSERT_TRUE(HasSerializedFrame(file_url));
323 const std::string& serialized_contents =
324 GetSerializedContentForFrame(file_url);
325 LoadContents(serialized_contents, file_url,
326 web_frame->document().encoding());
327 // Make sure serialized contents do not have document type.
328 web_frame = GetMainFrame();
329 doc = web_frame->document();
330 ASSERT_TRUE(!HasDocType(doc));
333 void SerializeXMLDocWithBuiltInEntitiesOnRenderer(
334 const GURL& xml_file_url, const std::string& original_contents) {
335 // Do serialization.
336 SerializeDomForURL(xml_file_url, false);
337 // Compare the serialized contents with original contents.
338 ASSERT_TRUE(HasSerializedFrame(xml_file_url));
339 const std::string& serialized_contents =
340 GetSerializedContentForFrame(xml_file_url);
341 ASSERT_EQ(original_contents, serialized_contents);
344 void SerializeHTMLDOMWithAddingMOTWOnRenderer(
345 const GURL& file_url, const std::string& original_contents) {
346 // Make sure original contents does not have MOTW;
347 std::string motw_declaration =
348 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
349 ASSERT_FALSE(motw_declaration.empty());
350 // The encoding of original contents is ISO-8859-1, so we convert the MOTW
351 // declaration to ASCII and search whether original contents has it or not.
352 ASSERT_TRUE(std::string::npos == original_contents.find(motw_declaration));
354 // Do serialization.
355 SerializeDomForURL(file_url, false);
356 // Make sure the serialized contents have MOTW ;
357 ASSERT_TRUE(HasSerializedFrame(file_url));
358 const std::string& serialized_contents =
359 GetSerializedContentForFrame(file_url);
360 ASSERT_FALSE(std::string::npos ==
361 serialized_contents.find(motw_declaration));
364 void SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer(
365 const GURL& file_url) {
366 // Make sure there is no META charset declaration in original document.
367 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
368 ASSERT_TRUE(web_frame != NULL);
369 WebDocument doc = web_frame->document();
370 ASSERT_TRUE(doc.isHTMLDocument());
371 WebElement head_element = doc.head();
372 ASSERT_TRUE(!head_element.isNull());
373 // Go through all children of HEAD element.
374 for (WebNode child = head_element.firstChild(); !child.isNull();
375 child = child.nextSibling()) {
376 std::string charset_info;
377 if (IsMetaElement(child, charset_info))
378 ASSERT_TRUE(charset_info.empty());
380 // Do serialization.
381 SerializeDomForURL(file_url, false);
383 // Load the serialized contents.
384 ASSERT_TRUE(HasSerializedFrame(file_url));
385 const std::string& serialized_contents =
386 GetSerializedContentForFrame(file_url);
387 LoadContents(serialized_contents, file_url,
388 web_frame->document().encoding());
389 // Make sure the first child of HEAD element is META which has charset
390 // declaration in serialized contents.
391 web_frame = GetMainFrame();
392 ASSERT_TRUE(web_frame != NULL);
393 doc = web_frame->document();
394 ASSERT_TRUE(doc.isHTMLDocument());
395 head_element = doc.head();
396 ASSERT_TRUE(!head_element.isNull());
397 WebNode meta_node = head_element.firstChild();
398 ASSERT_TRUE(!meta_node.isNull());
399 // Get meta charset info.
400 std::string charset_info2;
401 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
402 ASSERT_TRUE(!charset_info2.empty());
403 ASSERT_EQ(charset_info2,
404 std::string(web_frame->document().encoding().utf8()));
406 // Make sure no more additional META tags which have charset declaration.
407 for (WebNode child = meta_node.nextSibling(); !child.isNull();
408 child = child.nextSibling()) {
409 std::string charset_info;
410 if (IsMetaElement(child, charset_info))
411 ASSERT_TRUE(charset_info.empty());
415 void SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer(
416 const GURL& file_url) {
417 // Make sure there are multiple META charset declarations in original
418 // document.
419 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
420 ASSERT_TRUE(web_frame != NULL);
421 WebDocument doc = web_frame->document();
422 ASSERT_TRUE(doc.isHTMLDocument());
423 WebElement head_ele = doc.head();
424 ASSERT_TRUE(!head_ele.isNull());
425 // Go through all children of HEAD element.
426 int charset_declaration_count = 0;
427 for (WebNode child = head_ele.firstChild(); !child.isNull();
428 child = child.nextSibling()) {
429 std::string charset_info;
430 if (IsMetaElement(child, charset_info) && !charset_info.empty())
431 charset_declaration_count++;
433 // The original doc has more than META tags which have charset declaration.
434 ASSERT_TRUE(charset_declaration_count > 1);
436 // Do serialization.
437 SerializeDomForURL(file_url, false);
439 // Load the serialized contents.
440 ASSERT_TRUE(HasSerializedFrame(file_url));
441 const std::string& serialized_contents =
442 GetSerializedContentForFrame(file_url);
443 LoadContents(serialized_contents, file_url,
444 web_frame->document().encoding());
445 // Make sure only first child of HEAD element is META which has charset
446 // declaration in serialized contents.
447 web_frame = GetMainFrame();
448 ASSERT_TRUE(web_frame != NULL);
449 doc = web_frame->document();
450 ASSERT_TRUE(doc.isHTMLDocument());
451 head_ele = doc.head();
452 ASSERT_TRUE(!head_ele.isNull());
453 WebNode meta_node = head_ele.firstChild();
454 ASSERT_TRUE(!meta_node.isNull());
455 // Get meta charset info.
456 std::string charset_info2;
457 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
458 ASSERT_TRUE(!charset_info2.empty());
459 ASSERT_EQ(charset_info2,
460 std::string(web_frame->document().encoding().utf8()));
462 // Make sure no more additional META tags which have charset declaration.
463 for (WebNode child = meta_node.nextSibling(); !child.isNull();
464 child = child.nextSibling()) {
465 std::string charset_info;
466 if (IsMetaElement(child, charset_info))
467 ASSERT_TRUE(charset_info.empty());
471 void SerializeHTMLDOMWithEntitiesInTextOnRenderer() {
472 base::FilePath page_file_path = GetTestFilePath(
473 "dom_serializer", "dom_serializer/htmlentities_in_text.htm");
474 // Get file URL. The URL is dummy URL to identify the following loading
475 // actions. The test content is in constant:original_contents.
476 GURL file_url = net::FilePathToFileURL(page_file_path);
477 ASSERT_TRUE(file_url.SchemeIsFile());
478 // Test contents.
479 static const char* const original_contents =
480 "<html><body>&amp;&lt;&gt;\"\'</body></html>";
481 // Load the test contents.
482 LoadContents(original_contents, file_url, WebString());
484 // Get BODY's text content in DOM.
485 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
486 ASSERT_TRUE(web_frame != NULL);
487 WebDocument doc = web_frame->document();
488 ASSERT_TRUE(doc.isHTMLDocument());
489 WebElement body_ele = doc.body();
490 ASSERT_TRUE(!body_ele.isNull());
491 WebNode text_node = body_ele.firstChild();
492 ASSERT_TRUE(text_node.isTextNode());
493 ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) ==
494 "&amp;&lt;&gt;\"\'");
495 // Do serialization.
496 SerializeDomForURL(file_url, false);
497 // Compare the serialized contents with original contents.
498 ASSERT_TRUE(HasSerializedFrame(file_url));
499 const std::string& serialized_contents =
500 GetSerializedContentForFrame(file_url);
501 // Compare the serialized contents with original contents to make sure
502 // they are same.
503 // Because we add MOTW when serializing DOM, so before comparison, we also
504 // need to add MOTW to original_contents.
505 std::string original_str =
506 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
507 original_str += original_contents;
508 // Since WebCore now inserts a new HEAD element if there is no HEAD element
509 // when creating BODY element. (Please see
510 // HTMLParser::bodyCreateErrorCheck.) We need to append the HEAD content and
511 // corresponding META content if we find WebCore-generated HEAD element.
512 if (!doc.head().isNull()) {
513 WebString encoding = web_frame->document().encoding();
514 std::string htmlTag("<html>");
515 std::string::size_type pos = original_str.find(htmlTag);
516 ASSERT_NE(std::string::npos, pos);
517 pos += htmlTag.length();
518 std::string head_part("<head>");
519 head_part +=
520 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
521 head_part += "</head>";
522 original_str.insert(pos, head_part);
524 ASSERT_EQ(original_str, serialized_contents);
527 void SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer() {
528 base::FilePath page_file_path = GetTestFilePath(
529 "dom_serializer", "dom_serializer/htmlentities_in_attribute_value.htm");
530 // Get file URL. The URL is dummy URL to identify the following loading
531 // actions. The test content is in constant:original_contents.
532 GURL file_url = net::FilePathToFileURL(page_file_path);
533 ASSERT_TRUE(file_url.SchemeIsFile());
534 // Test contents.
535 static const char* const original_contents =
536 "<html><body title=\"&amp;&lt;&gt;&quot;&#39;\"></body></html>";
537 // Load the test contents.
538 LoadContents(original_contents, file_url, WebString());
539 // Get value of BODY's title attribute in DOM.
540 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
541 ASSERT_TRUE(web_frame != NULL);
542 WebDocument doc = web_frame->document();
543 ASSERT_TRUE(doc.isHTMLDocument());
544 WebElement body_ele = doc.body();
545 ASSERT_TRUE(!body_ele.isNull());
546 WebString value = body_ele.getAttribute("title");
547 ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'");
548 // Do serialization.
549 SerializeDomForURL(file_url, false);
550 // Compare the serialized contents with original contents.
551 ASSERT_TRUE(HasSerializedFrame(file_url));
552 const std::string& serialized_contents =
553 GetSerializedContentForFrame(file_url);
554 // Compare the serialized contents with original contents to make sure
555 // they are same.
556 std::string original_str =
557 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
558 original_str += original_contents;
559 if (!doc.isNull()) {
560 WebString encoding = web_frame->document().encoding();
561 std::string htmlTag("<html>");
562 std::string::size_type pos = original_str.find(htmlTag);
563 ASSERT_NE(std::string::npos, pos);
564 pos += htmlTag.length();
565 std::string head_part("<head>");
566 head_part +=
567 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
568 head_part += "</head>";
569 original_str.insert(pos, head_part);
571 ASSERT_EQ(original_str, serialized_contents);
574 void SerializeHTMLDOMWithNonStandardEntitiesOnRenderer(const GURL& file_url) {
575 // Get value of BODY's title attribute in DOM.
576 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
577 WebDocument doc = web_frame->document();
578 ASSERT_TRUE(doc.isHTMLDocument());
579 WebElement body_element = doc.body();
580 // Unescaped string for "&percnt;&nsup;&sup1;&apos;".
581 static const wchar_t parsed_value[] = {
582 '%', 0x2285, 0x00b9, '\'', 0
584 WebString value = body_element.getAttribute("title");
585 ASSERT_TRUE(base::UTF16ToWide(value) == parsed_value);
586 ASSERT_TRUE(base::UTF16ToWide(body_element.innerText()) == parsed_value);
588 // Do serialization.
589 SerializeDomForURL(file_url, false);
590 // Check the serialized string.
591 ASSERT_TRUE(HasSerializedFrame(file_url));
592 const std::string& serialized_contents =
593 GetSerializedContentForFrame(file_url);
594 // Confirm that the serialized string has no non-standard HTML entities.
595 ASSERT_EQ(std::string::npos, serialized_contents.find("&percnt;"));
596 ASSERT_EQ(std::string::npos, serialized_contents.find("&nsup;"));
597 ASSERT_EQ(std::string::npos, serialized_contents.find("&sup1;"));
598 ASSERT_EQ(std::string::npos, serialized_contents.find("&apos;"));
601 void SerializeHTMLDOMWithBaseTagOnRenderer(const GURL& file_url,
602 const GURL& path_dir_url) {
603 // There are total 2 available base tags in this test file.
604 const int kTotalBaseTagCountInTestFile = 2;
606 // Since for this test, we assume there is no savable sub-resource links for
607 // this test file, also all links are relative URLs in this test file, so we
608 // need to check those relative URLs and make sure document has BASE tag.
609 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
610 ASSERT_TRUE(web_frame != NULL);
611 WebDocument doc = web_frame->document();
612 ASSERT_TRUE(doc.isHTMLDocument());
613 // Go through all descent nodes.
614 WebElementCollection all = doc.all();
615 int original_base_tag_count = 0;
616 for (WebElement element = all.firstItem(); !element.isNull();
617 element = all.nextItem()) {
618 if (element.hasHTMLTagName("base")) {
619 original_base_tag_count++;
620 } else {
621 // Get link.
622 WebString value = GetSubResourceLinkFromElement(element);
623 if (value.isNull() && element.hasHTMLTagName("a")) {
624 value = element.getAttribute("href");
625 if (value.isEmpty())
626 value = WebString();
628 // Each link is relative link.
629 if (!value.isNull()) {
630 GURL link(value.utf8());
631 ASSERT_TRUE(link.scheme().empty());
635 ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile);
636 // Make sure in original document, the base URL is not equal with the
637 // |path_dir_url|.
638 GURL original_base_url(doc.baseURL());
639 ASSERT_NE(original_base_url, path_dir_url);
641 // Do serialization.
642 SerializeDomForURL(file_url, false);
644 // Load the serialized contents.
645 ASSERT_TRUE(HasSerializedFrame(file_url));
646 const std::string& serialized_contents =
647 GetSerializedContentForFrame(file_url);
648 LoadContents(serialized_contents, file_url,
649 web_frame->document().encoding());
651 // Make sure all links are absolute URLs and doc there are some number of
652 // BASE tags in serialized HTML data. Each of those BASE tags have same base
653 // URL which is as same as URL of current test file.
654 web_frame = GetMainFrame();
655 ASSERT_TRUE(web_frame != NULL);
656 doc = web_frame->document();
657 ASSERT_TRUE(doc.isHTMLDocument());
658 // Go through all descent nodes.
659 all = doc.all();
660 int new_base_tag_count = 0;
661 for (WebNode node = all.firstItem(); !node.isNull();
662 node = all.nextItem()) {
663 if (!node.isElementNode())
664 continue;
665 WebElement element = node.to<WebElement>();
666 if (element.hasHTMLTagName("base")) {
667 new_base_tag_count++;
668 } else {
669 // Get link.
670 WebString value = GetSubResourceLinkFromElement(element);
671 if (value.isNull() && element.hasHTMLTagName("a")) {
672 value = element.getAttribute("href");
673 if (value.isEmpty())
674 value = WebString();
676 // Each link is absolute link.
677 if (!value.isNull()) {
678 GURL link(std::string(value.utf8()));
679 ASSERT_FALSE(link.scheme().empty());
683 // We have one more added BASE tag which is generated by JavaScript.
684 ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1);
685 // Make sure in new document, the base URL is equal with the |path_dir_url|.
686 GURL new_base_url(doc.baseURL());
687 ASSERT_EQ(new_base_url, path_dir_url);
690 void SerializeHTMLDOMWithEmptyHeadOnRenderer() {
691 base::FilePath page_file_path = GetTestFilePath(
692 "dom_serializer", "empty_head.htm");
693 GURL file_url = net::FilePathToFileURL(page_file_path);
694 ASSERT_TRUE(file_url.SchemeIsFile());
696 // Load the test html content.
697 static const char* const empty_head_contents =
698 "<html><head></head><body>hello world</body></html>";
699 LoadContents(empty_head_contents, file_url, WebString());
701 // Make sure the head tag is empty.
702 WebFrame* web_frame = GetMainFrame();
703 ASSERT_TRUE(web_frame != NULL);
704 WebDocument doc = web_frame->document();
705 ASSERT_TRUE(doc.isHTMLDocument());
706 WebElement head_element = doc.head();
707 ASSERT_TRUE(!head_element.isNull());
708 ASSERT_TRUE(!head_element.hasChildNodes());
709 ASSERT_TRUE(head_element.childNodes().length() == 0);
711 // Do serialization.
712 SerializeDomForURL(file_url, false);
713 // Make sure the serialized contents have META ;
714 ASSERT_TRUE(HasSerializedFrame(file_url));
715 const std::string& serialized_contents =
716 GetSerializedContentForFrame(file_url);
718 // Reload serialized contents and make sure there is only one META tag.
719 LoadContents(serialized_contents, file_url,
720 web_frame->document().encoding());
721 web_frame = GetMainFrame();
722 ASSERT_TRUE(web_frame != NULL);
723 doc = web_frame->document();
724 ASSERT_TRUE(doc.isHTMLDocument());
725 head_element = doc.head();
726 ASSERT_TRUE(!head_element.isNull());
727 ASSERT_TRUE(head_element.hasChildNodes());
728 ASSERT_TRUE(head_element.childNodes().length() == 1);
729 WebNode meta_node = head_element.firstChild();
730 ASSERT_TRUE(!meta_node.isNull());
731 // Get meta charset info.
732 std::string charset_info;
733 ASSERT_TRUE(IsMetaElement(meta_node, charset_info));
734 ASSERT_TRUE(!charset_info.empty());
735 ASSERT_EQ(charset_info,
736 std::string(web_frame->document().encoding().utf8()));
738 // Check the body's first node is text node and its contents are
739 // "hello world"
740 WebElement body_element = doc.body();
741 ASSERT_TRUE(!body_element.isNull());
742 WebNode text_node = body_element.firstChild();
743 ASSERT_TRUE(text_node.isTextNode());
744 WebString text_node_contents = text_node.nodeValue();
745 ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world");
748 void SerializeDocumentWithDownloadedIFrameOnRenderer(const GURL& file_url) {
749 // Do a recursive serialization. We pass if we don't crash.
750 SerializeDomForURL(file_url, true);
753 void SubResourceForElementsInNonHTMLNamespaceOnRenderer(
754 const GURL& file_url) {
755 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
756 ASSERT_TRUE(web_frame != NULL);
757 WebDocument doc = web_frame->document();
758 WebNode lastNodeInBody = doc.body().lastChild();
759 ASSERT_EQ(WebNode::ElementNode, lastNodeInBody.nodeType());
760 WebString uri = GetSubResourceLinkFromElement(
761 lastNodeInBody.to<WebElement>());
762 EXPECT_TRUE(uri.isNull());
765 private:
766 // Map frame_url to corresponding serialized_content.
767 typedef base::hash_map<std::string, std::string> SerializedFrameContentMap;
768 SerializedFrameContentMap serialized_frame_map_;
769 // Map frame_url to corresponding status of serialization finish.
770 typedef base::hash_map<std::string, bool> SerializationFinishStatusMap;
771 SerializationFinishStatusMap serialization_finish_status_;
772 // Flag indicates whether the process of serializing DOM is finished or not.
773 bool serialized_;
774 // The local_directory_name_ is dummy relative path of directory which
775 // contain all saved auxiliary files included all sub frames and resources.
776 const base::FilePath local_directory_name_;
779 // If original contents have document type, the serialized contents also have
780 // document type.
781 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) {
782 base::FilePath page_file_path =
783 GetTestFilePath("dom_serializer", "youtube_1.htm");
784 GURL file_url = net::FilePathToFileURL(page_file_path);
785 ASSERT_TRUE(file_url.SchemeIsFile());
786 // Load the test file.
787 NavigateToURL(shell(), file_url);
789 PostTaskToInProcessRendererAndWait(
790 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithDocTypeOnRenderer,
791 base::Unretained(this), file_url));
794 // If original contents do not have document type, the serialized contents
795 // also do not have document type.
796 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) {
797 base::FilePath page_file_path =
798 GetTestFilePath("dom_serializer", "youtube_2.htm");
799 GURL file_url = net::FilePathToFileURL(page_file_path);
800 ASSERT_TRUE(file_url.SchemeIsFile());
801 // Load the test file.
802 NavigateToURL(shell(), file_url);
804 PostTaskToInProcessRendererAndWait(
805 base::Bind(
806 &DomSerializerTests::SerializeHTMLDOMWithoutDocTypeOnRenderer,
807 base::Unretained(this), file_url));
810 // Serialize XML document which has all 5 built-in entities. After
811 // finishing serialization, the serialized contents should be same
812 // with original XML document.
814 // TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
815 // XML headers are handled differently in the merged serializer.
816 // Bug: http://crbug.com/328354
817 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
818 DISABLED_SerializeXMLDocWithBuiltInEntities) {
819 base::FilePath page_file_path =
820 GetTestFilePath("dom_serializer", "note.html");
821 base::FilePath xml_file_path = GetTestFilePath("dom_serializer", "note.xml");
822 // Read original contents for later comparison.
823 std::string original_contents;
824 ASSERT_TRUE(base::ReadFileToString(xml_file_path, &original_contents));
825 // Get file URL.
826 GURL file_url = net::FilePathToFileURL(page_file_path);
827 GURL xml_file_url = net::FilePathToFileURL(xml_file_path);
828 ASSERT_TRUE(file_url.SchemeIsFile());
829 // Load the test file.
830 NavigateToURL(shell(), file_url);
832 PostTaskToInProcessRendererAndWait(
833 base::Bind(
834 &DomSerializerTests::SerializeXMLDocWithBuiltInEntitiesOnRenderer,
835 base::Unretained(this), xml_file_url, original_contents));
838 // When serializing DOM, we add MOTW declaration before html tag.
839 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) {
840 base::FilePath page_file_path =
841 GetTestFilePath("dom_serializer", "youtube_2.htm");
842 // Read original contents for later comparison .
843 std::string original_contents;
844 ASSERT_TRUE(base::ReadFileToString(page_file_path, &original_contents));
845 // Get file URL.
846 GURL file_url = net::FilePathToFileURL(page_file_path);
847 ASSERT_TRUE(file_url.SchemeIsFile());
849 // Load the test file.
850 NavigateToURL(shell(), file_url);
852 PostTaskToInProcessRendererAndWait(
853 base::Bind(
854 &DomSerializerTests::SerializeHTMLDOMWithAddingMOTWOnRenderer,
855 base::Unretained(this), file_url, original_contents));
858 // When serializing DOM, we will add the META which have correct charset
859 // declaration as first child of HEAD element for resolving WebKit bug:
860 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
861 // does not have META charset declaration.
862 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
863 SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) {
864 base::FilePath page_file_path =
865 GetTestFilePath("dom_serializer", "youtube_1.htm");
866 // Get file URL.
867 GURL file_url = net::FilePathToFileURL(page_file_path);
868 ASSERT_TRUE(file_url.SchemeIsFile());
869 // Load the test file.
870 NavigateToURL(shell(), file_url);
872 PostTaskToInProcessRendererAndWait(
873 base::Bind(
874 &DomSerializerTests::
875 SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer,
876 base::Unretained(this), file_url));
879 // When serializing DOM, if the original document has multiple META charset
880 // declaration, we will add the META which have correct charset declaration
881 // as first child of HEAD element and remove all original META charset
882 // declarations.
883 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
884 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) {
885 base::FilePath page_file_path =
886 GetTestFilePath("dom_serializer", "youtube_2.htm");
887 // Get file URL.
888 GURL file_url = net::FilePathToFileURL(page_file_path);
889 ASSERT_TRUE(file_url.SchemeIsFile());
890 // Load the test file.
891 NavigateToURL(shell(), file_url);
893 PostTaskToInProcessRendererAndWait(
894 base::Bind(
895 &DomSerializerTests::
896 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer,
897 base::Unretained(this), file_url));
900 // Test situation of html entities in text when serializing HTML DOM.
901 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) {
902 // Need to spin up the renderer and also navigate to a file url so that the
903 // renderer code doesn't attempt a fork when it sees a load to file scheme
904 // from non-file scheme.
905 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
907 PostTaskToInProcessRendererAndWait(
908 base::Bind(
909 &DomSerializerTests::SerializeHTMLDOMWithEntitiesInTextOnRenderer,
910 base::Unretained(this)));
913 // Test situation of html entities in attribute value when serializing
914 // HTML DOM.
915 // This test started to fail at WebKit r65388. See http://crbug.com/52279.
917 // TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
918 // Some attributes are handled differently in the merged serializer.
919 // Bug: http://crbug.com/328354
920 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
921 DISABLED_SerializeHTMLDOMWithEntitiesInAttributeValue) {
922 // Need to spin up the renderer and also navigate to a file url so that the
923 // renderer code doesn't attempt a fork when it sees a load to file scheme
924 // from non-file scheme.
925 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
927 PostTaskToInProcessRendererAndWait(
928 base::Bind(
929 &DomSerializerTests::
930 SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer,
931 base::Unretained(this)));
934 // Test situation of non-standard HTML entities when serializing HTML DOM.
935 // This test started to fail at WebKit r65351. See http://crbug.com/52279.
936 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
937 SerializeHTMLDOMWithNonStandardEntities) {
938 // Make a test file URL and load it.
939 base::FilePath page_file_path = GetTestFilePath(
940 "dom_serializer", "nonstandard_htmlentities.htm");
941 GURL file_url = net::FilePathToFileURL(page_file_path);
942 NavigateToURL(shell(), file_url);
944 PostTaskToInProcessRendererAndWait(
945 base::Bind(
946 &DomSerializerTests::
947 SerializeHTMLDOMWithNonStandardEntitiesOnRenderer,
948 base::Unretained(this), file_url));
951 // Test situation of BASE tag in original document when serializing HTML DOM.
952 // When serializing, we should comment the BASE tag, append a new BASE tag.
953 // rewrite all the savable URLs to relative local path, and change other URLs
954 // to absolute URLs.
956 // TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
957 // Base tags are handled a bit different in merged version.
958 // Bug: http://crbug.com/328354
959 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
960 DISABLED_SerializeHTMLDOMWithBaseTag) {
961 base::FilePath page_file_path = GetTestFilePath(
962 "dom_serializer", "html_doc_has_base_tag.htm");
964 // Get page dir URL which is base URL of this file.
965 base::FilePath dir_name = page_file_path.DirName();
966 dir_name = dir_name.Append(
967 base::FilePath::StringType(base::FilePath::kSeparators[0], 1));
968 GURL path_dir_url = net::FilePathToFileURL(dir_name);
970 // Get file URL.
971 GURL file_url = net::FilePathToFileURL(page_file_path);
972 ASSERT_TRUE(file_url.SchemeIsFile());
973 // Load the test file.
974 NavigateToURL(shell(), file_url);
976 PostTaskToInProcessRendererAndWait(
977 base::Bind(
978 &DomSerializerTests::SerializeHTMLDOMWithBaseTagOnRenderer,
979 base::Unretained(this), file_url, path_dir_url));
982 // Serializing page which has an empty HEAD tag.
983 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) {
984 // Need to spin up the renderer and also navigate to a file url so that the
985 // renderer code doesn't attempt a fork when it sees a load to file scheme
986 // from non-file scheme.
987 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
989 PostTaskToInProcessRendererAndWait(
990 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithEmptyHeadOnRenderer,
991 base::Unretained(this)));
994 // Test that we don't crash when the page contains an iframe that
995 // was handled as a download (http://crbug.com/42212).
996 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
997 SerializeDocumentWithDownloadedIFrame) {
998 base::FilePath page_file_path = GetTestFilePath(
999 "dom_serializer", "iframe-src-is-exe.htm");
1000 GURL file_url = net::FilePathToFileURL(page_file_path);
1001 ASSERT_TRUE(file_url.SchemeIsFile());
1002 // Load the test file.
1003 NavigateToURL(shell(), file_url);
1005 PostTaskToInProcessRendererAndWait(
1006 base::Bind(
1007 &DomSerializerTests::
1008 SerializeDocumentWithDownloadedIFrameOnRenderer,
1009 base::Unretained(this), file_url));
1012 IN_PROC_BROWSER_TEST_F(DomSerializerTests,
1013 SubResourceForElementsInNonHTMLNamespace) {
1014 base::FilePath page_file_path = GetTestFilePath(
1015 "dom_serializer", "non_html_namespace.htm");
1016 GURL file_url = net::FilePathToFileURL(page_file_path);
1017 NavigateToURL(shell(), file_url);
1019 PostTaskToInProcessRendererAndWait(
1020 base::Bind(
1021 &DomSerializerTests::
1022 SubResourceForElementsInNonHTMLNamespaceOnRenderer,
1023 base::Unretained(this), file_url));
1026 } // namespace content