1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
6 #include "base/command_line.h"
7 #include "base/compiler_specific.h"
8 #include "base/containers/hash_tables.h"
9 #include "base/files/file_path.h"
10 #include "base/files/file_util.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "content/public/browser/render_view_host.h"
14 #include "content/public/browser/web_contents.h"
15 #include "content/public/common/content_switches.h"
16 #include "content/public/renderer/render_view.h"
17 #include "content/public/renderer/render_view_observer.h"
18 #include "content/public/test/content_browser_test.h"
19 #include "content/public/test/content_browser_test_utils.h"
20 #include "content/public/test/test_utils.h"
21 #include "content/renderer/savable_resources.h"
22 #include "content/shell/browser/shell.h"
23 #include "net/base/filename_util.h"
24 #include "net/url_request/url_request_context.h"
25 #include "third_party/WebKit/public/platform/WebCString.h"
26 #include "third_party/WebKit/public/platform/WebData.h"
27 #include "third_party/WebKit/public/platform/WebString.h"
28 #include "third_party/WebKit/public/platform/WebURL.h"
29 #include "third_party/WebKit/public/platform/WebVector.h"
30 #include "third_party/WebKit/public/web/WebDocument.h"
31 #include "third_party/WebKit/public/web/WebElement.h"
32 #include "third_party/WebKit/public/web/WebElementCollection.h"
33 #include "third_party/WebKit/public/web/WebLocalFrame.h"
34 #include "third_party/WebKit/public/web/WebNode.h"
35 #include "third_party/WebKit/public/web/WebNodeList.h"
36 #include "third_party/WebKit/public/web/WebPageSerializer.h"
37 #include "third_party/WebKit/public/web/WebPageSerializerClient.h"
38 #include "third_party/WebKit/public/web/WebView.h"
40 using blink::WebCString
;
42 using blink::WebDocument
;
43 using blink::WebElement
;
44 using blink::WebElementCollection
;
45 using blink::WebFrame
;
46 using blink::WebLocalFrame
;
48 using blink::WebNodeList
;
49 using blink::WebPageSerializer
;
50 using blink::WebPageSerializerClient
;
51 using blink::WebString
;
54 using blink::WebVector
;
58 // Iterate recursively over sub-frames to find one with with a given url.
59 WebFrame
* FindSubFrameByURL(WebView
* web_view
, const GURL
& url
) {
60 if (!web_view
->mainFrame())
63 std::vector
<WebFrame
*> stack
;
64 stack
.push_back(web_view
->mainFrame());
66 while (!stack
.empty()) {
67 WebFrame
* current_frame
= stack
.back();
69 if (GURL(current_frame
->document().url()) == url
)
71 WebElementCollection all
= current_frame
->document().all();
72 for (WebElement element
= all
.firstItem();
73 !element
.isNull(); element
= all
.nextItem()) {
74 // Check frame tag and iframe tag
75 if (!element
.hasHTMLTagName("frame") && !element
.hasHTMLTagName("iframe"))
77 WebFrame
* sub_frame
= WebLocalFrame::fromFrameOwnerElement(element
);
79 stack
.push_back(sub_frame
);
85 // Helper function that test whether the first node in the doc is a doc type
87 bool HasDocType(const WebDocument
& doc
) {
88 WebNode node
= doc
.firstChild();
91 return node
.nodeType() == WebNode::DocumentTypeNode
;
94 // Helper function for checking whether input node is META tag. Return true
95 // means it is META element, otherwise return false. The parameter charset_info
96 // return actual charset info if the META tag has charset declaration.
97 bool IsMetaElement(const WebNode
& node
, std::string
& charset_info
) {
98 if (!node
.isElementNode())
100 const WebElement meta
= node
.toConst
<WebElement
>();
101 if (!meta
.hasHTMLTagName("meta"))
103 charset_info
.erase(0, charset_info
.length());
104 // Check the META charset declaration.
105 WebString httpEquiv
= meta
.getAttribute("http-equiv");
106 if (base::LowerCaseEqualsASCII(base::StringPiece16(httpEquiv
),
108 std::string content
= meta
.getAttribute("content").utf8();
109 int pos
= content
.find("charset", 0);
111 // Add a dummy charset declaration to charset_info, which indicates this
112 // META tag has charset declaration although we do not get correct value
114 charset_info
.append("has-charset-declaration");
115 int remaining_length
= content
.length() - pos
- 7;
116 if (!remaining_length
)
118 int start_pos
= pos
+ 7;
120 while (remaining_length
--)
121 if (content
[start_pos
++] == L
'=')
123 // Skip beginning space.
124 while (remaining_length
) {
125 if (content
[start_pos
] > 0x0020)
130 if (!remaining_length
)
132 int end_pos
= start_pos
;
133 // Now we find out the start point of charset info. Search the end point.
134 while (remaining_length
--) {
135 if (content
[end_pos
] <= 0x0020 || content
[end_pos
] == L
';')
139 // Get actual charset info.
140 charset_info
= content
.substr(start_pos
, end_pos
- start_pos
);
147 class LoadObserver
: public RenderViewObserver
{
149 LoadObserver(RenderView
* render_view
, const base::Closure
& quit_closure
)
150 : RenderViewObserver(render_view
),
151 quit_closure_(quit_closure
) {}
153 void DidFinishLoad(blink::WebLocalFrame
* frame
) override
{
154 if (frame
== render_view()->GetWebView()->mainFrame())
159 base::Closure quit_closure_
;
162 class DomSerializerTests
: public ContentBrowserTest
,
163 public WebPageSerializerClient
{
166 : serialized_(false),
167 local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) {}
169 void SetUpCommandLine(base::CommandLine
* command_line
) override
{
170 command_line
->AppendSwitch(switches::kSingleProcess
);
172 // Don't want to try to create a GPU process.
173 command_line
->AppendSwitch(switches::kDisableGpu
);
177 void SetUpOnMainThread() override
{
178 render_view_routing_id_
=
179 shell()->web_contents()->GetRenderViewHost()->GetRoutingID();
182 // DomSerializerDelegate.
183 virtual void didSerializeDataForFrame(const WebURL
& frame_web_url
,
184 const WebCString
& data
,
185 PageSerializationStatus status
) {
187 GURL
frame_url(frame_web_url
);
188 // If the all frames are finished saving, check all finish status
189 if (status
== WebPageSerializerClient::AllFramesAreFinished
) {
190 SerializationFinishStatusMap::iterator it
=
191 serialization_finish_status_
.begin();
192 for (; it
!= serialization_finish_status_
.end(); ++it
)
193 ASSERT_TRUE(it
->second
);
198 // Check finish status of current frame.
199 SerializationFinishStatusMap::iterator it
=
200 serialization_finish_status_
.find(frame_url
.spec());
201 // New frame, set initial status as false.
202 if (it
== serialization_finish_status_
.end())
203 serialization_finish_status_
[frame_url
.spec()] = false;
205 it
= serialization_finish_status_
.find(frame_url
.spec());
206 ASSERT_TRUE(it
!= serialization_finish_status_
.end());
207 // In process frame, finish status should be false.
208 ASSERT_FALSE(it
->second
);
210 // Add data to corresponding frame's content.
211 serialized_frame_map_
[frame_url
.spec()] += data
.data();
213 // Current frame is completed saving, change the finish status.
214 if (status
== WebPageSerializerClient::CurrentFrameIsFinished
)
218 bool HasSerializedFrame(const GURL
& frame_url
) {
219 return serialized_frame_map_
.find(frame_url
.spec()) !=
220 serialized_frame_map_
.end();
223 const std::string
& GetSerializedContentForFrame(
224 const GURL
& frame_url
) {
225 return serialized_frame_map_
[frame_url
.spec()];
228 RenderView
* GetRenderView() {
229 return RenderView::FromRoutingID(render_view_routing_id_
);
232 WebView
* GetWebView() {
233 return GetRenderView()->GetWebView();
236 WebFrame
* GetMainFrame() {
237 return GetWebView()->mainFrame();
240 // Load web page according to input content and relative URLs within
242 void LoadContents(const std::string
& contents
,
243 const GURL
& base_url
,
244 const WebString encoding_info
) {
245 scoped_refptr
<MessageLoopRunner
> runner
= new MessageLoopRunner
;
246 LoadObserver
observer(GetRenderView(), runner
->QuitClosure());
248 // If input encoding is empty, use UTF-8 as default encoding.
249 if (encoding_info
.isEmpty()) {
250 GetMainFrame()->loadHTMLString(contents
, base_url
);
252 WebData
data(contents
.data(), contents
.length());
254 // Do not use WebFrame.LoadHTMLString because it assumes that input
255 // html contents use UTF-8 encoding.
256 // TODO(darin): This should use WebFrame::loadData.
257 WebFrame
* web_frame
= GetMainFrame();
259 ASSERT_TRUE(web_frame
!= NULL
);
261 web_frame
->loadData(data
, "text/html", encoding_info
, base_url
);
267 // Serialize page DOM according to specific page URL. The parameter
268 // recursive_serialization indicates whether we will serialize all
270 void SerializeDomForURL(const GURL
& page_url
,
271 bool recursive_serialization
) {
272 // Find corresponding WebFrame according to page_url.
273 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), page_url
);
274 ASSERT_TRUE(web_frame
!= NULL
);
275 WebVector
<WebURL
> links
;
276 links
.assign(&page_url
, 1);
277 WebString file_path
=
278 base::FilePath(FILE_PATH_LITERAL("c:\\dummy.htm")).AsUTF16Unsafe();
279 WebVector
<WebString
> local_paths
;
280 local_paths
.assign(&file_path
, 1);
281 // Start serializing DOM.
282 bool result
= WebPageSerializer::serialize(web_frame
->toWebLocalFrame(),
283 recursive_serialization
,
284 static_cast<WebPageSerializerClient
*>(this),
287 local_directory_name_
.AsUTF16Unsafe());
289 ASSERT_TRUE(serialized_
);
292 void SerializeHTMLDOMWithDocTypeOnRenderer(const GURL
& file_url
) {
293 // Make sure original contents have document type.
294 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
295 ASSERT_TRUE(web_frame
!= NULL
);
296 WebDocument doc
= web_frame
->document();
297 ASSERT_TRUE(HasDocType(doc
));
299 SerializeDomForURL(file_url
, false);
300 // Load the serialized contents.
301 ASSERT_TRUE(HasSerializedFrame(file_url
));
302 const std::string
& serialized_contents
=
303 GetSerializedContentForFrame(file_url
);
304 LoadContents(serialized_contents
, file_url
,
305 web_frame
->document().encoding());
306 // Make sure serialized contents still have document type.
307 web_frame
= GetMainFrame();
308 doc
= web_frame
->document();
309 ASSERT_TRUE(HasDocType(doc
));
312 void SerializeHTMLDOMWithoutDocTypeOnRenderer(const GURL
& file_url
) {
313 // Make sure original contents do not have document type.
314 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
315 ASSERT_TRUE(web_frame
!= NULL
);
316 WebDocument doc
= web_frame
->document();
317 ASSERT_TRUE(!HasDocType(doc
));
319 SerializeDomForURL(file_url
, false);
320 // Load the serialized contents.
321 ASSERT_TRUE(HasSerializedFrame(file_url
));
322 const std::string
& serialized_contents
=
323 GetSerializedContentForFrame(file_url
);
324 LoadContents(serialized_contents
, file_url
,
325 web_frame
->document().encoding());
326 // Make sure serialized contents do not have document type.
327 web_frame
= GetMainFrame();
328 doc
= web_frame
->document();
329 ASSERT_TRUE(!HasDocType(doc
));
332 void SerializeXMLDocWithBuiltInEntitiesOnRenderer(
333 const GURL
& xml_file_url
, const std::string
& original_contents
) {
335 SerializeDomForURL(xml_file_url
, false);
336 // Compare the serialized contents with original contents.
337 ASSERT_TRUE(HasSerializedFrame(xml_file_url
));
338 const std::string
& serialized_contents
=
339 GetSerializedContentForFrame(xml_file_url
);
340 ASSERT_EQ(original_contents
, serialized_contents
);
343 void SerializeHTMLDOMWithAddingMOTWOnRenderer(
344 const GURL
& file_url
, const std::string
& original_contents
) {
345 // Make sure original contents does not have MOTW;
346 std::string motw_declaration
=
347 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url
).utf8();
348 ASSERT_FALSE(motw_declaration
.empty());
349 // The encoding of original contents is ISO-8859-1, so we convert the MOTW
350 // declaration to ASCII and search whether original contents has it or not.
351 ASSERT_TRUE(std::string::npos
== original_contents
.find(motw_declaration
));
354 SerializeDomForURL(file_url
, false);
355 // Make sure the serialized contents have MOTW ;
356 ASSERT_TRUE(HasSerializedFrame(file_url
));
357 const std::string
& serialized_contents
=
358 GetSerializedContentForFrame(file_url
);
359 ASSERT_FALSE(std::string::npos
==
360 serialized_contents
.find(motw_declaration
));
363 void SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer(
364 const GURL
& file_url
) {
365 // Make sure there is no META charset declaration in original document.
366 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
367 ASSERT_TRUE(web_frame
!= NULL
);
368 WebDocument doc
= web_frame
->document();
369 ASSERT_TRUE(doc
.isHTMLDocument());
370 WebElement head_element
= doc
.head();
371 ASSERT_TRUE(!head_element
.isNull());
372 // Go through all children of HEAD element.
373 for (WebNode child
= head_element
.firstChild(); !child
.isNull();
374 child
= child
.nextSibling()) {
375 std::string charset_info
;
376 if (IsMetaElement(child
, charset_info
))
377 ASSERT_TRUE(charset_info
.empty());
380 SerializeDomForURL(file_url
, false);
382 // Load the serialized contents.
383 ASSERT_TRUE(HasSerializedFrame(file_url
));
384 const std::string
& serialized_contents
=
385 GetSerializedContentForFrame(file_url
);
386 LoadContents(serialized_contents
, file_url
,
387 web_frame
->document().encoding());
388 // Make sure the first child of HEAD element is META which has charset
389 // declaration in serialized contents.
390 web_frame
= GetMainFrame();
391 ASSERT_TRUE(web_frame
!= NULL
);
392 doc
= web_frame
->document();
393 ASSERT_TRUE(doc
.isHTMLDocument());
394 head_element
= doc
.head();
395 ASSERT_TRUE(!head_element
.isNull());
396 WebNode meta_node
= head_element
.firstChild();
397 ASSERT_TRUE(!meta_node
.isNull());
398 // Get meta charset info.
399 std::string charset_info2
;
400 ASSERT_TRUE(IsMetaElement(meta_node
, charset_info2
));
401 ASSERT_TRUE(!charset_info2
.empty());
402 ASSERT_EQ(charset_info2
,
403 std::string(web_frame
->document().encoding().utf8()));
405 // Make sure no more additional META tags which have charset declaration.
406 for (WebNode child
= meta_node
.nextSibling(); !child
.isNull();
407 child
= child
.nextSibling()) {
408 std::string charset_info
;
409 if (IsMetaElement(child
, charset_info
))
410 ASSERT_TRUE(charset_info
.empty());
414 void SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer(
415 const GURL
& file_url
) {
416 // Make sure there are multiple META charset declarations in original
418 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
419 ASSERT_TRUE(web_frame
!= NULL
);
420 WebDocument doc
= web_frame
->document();
421 ASSERT_TRUE(doc
.isHTMLDocument());
422 WebElement head_ele
= doc
.head();
423 ASSERT_TRUE(!head_ele
.isNull());
424 // Go through all children of HEAD element.
425 int charset_declaration_count
= 0;
426 for (WebNode child
= head_ele
.firstChild(); !child
.isNull();
427 child
= child
.nextSibling()) {
428 std::string charset_info
;
429 if (IsMetaElement(child
, charset_info
) && !charset_info
.empty())
430 charset_declaration_count
++;
432 // The original doc has more than META tags which have charset declaration.
433 ASSERT_TRUE(charset_declaration_count
> 1);
436 SerializeDomForURL(file_url
, false);
438 // Load the serialized contents.
439 ASSERT_TRUE(HasSerializedFrame(file_url
));
440 const std::string
& serialized_contents
=
441 GetSerializedContentForFrame(file_url
);
442 LoadContents(serialized_contents
, file_url
,
443 web_frame
->document().encoding());
444 // Make sure only first child of HEAD element is META which has charset
445 // declaration in serialized contents.
446 web_frame
= GetMainFrame();
447 ASSERT_TRUE(web_frame
!= NULL
);
448 doc
= web_frame
->document();
449 ASSERT_TRUE(doc
.isHTMLDocument());
450 head_ele
= doc
.head();
451 ASSERT_TRUE(!head_ele
.isNull());
452 WebNode meta_node
= head_ele
.firstChild();
453 ASSERT_TRUE(!meta_node
.isNull());
454 // Get meta charset info.
455 std::string charset_info2
;
456 ASSERT_TRUE(IsMetaElement(meta_node
, charset_info2
));
457 ASSERT_TRUE(!charset_info2
.empty());
458 ASSERT_EQ(charset_info2
,
459 std::string(web_frame
->document().encoding().utf8()));
461 // Make sure no more additional META tags which have charset declaration.
462 for (WebNode child
= meta_node
.nextSibling(); !child
.isNull();
463 child
= child
.nextSibling()) {
464 std::string charset_info
;
465 if (IsMetaElement(child
, charset_info
))
466 ASSERT_TRUE(charset_info
.empty());
470 void SerializeHTMLDOMWithEntitiesInTextOnRenderer() {
471 base::FilePath page_file_path
= GetTestFilePath(
472 "dom_serializer", "dom_serializer/htmlentities_in_text.htm");
473 // Get file URL. The URL is dummy URL to identify the following loading
474 // actions. The test content is in constant:original_contents.
475 GURL file_url
= net::FilePathToFileURL(page_file_path
);
476 ASSERT_TRUE(file_url
.SchemeIsFile());
478 static const char* const original_contents
=
479 "<html><body>&<>\"\'</body></html>";
480 // Load the test contents.
481 LoadContents(original_contents
, file_url
, WebString());
483 // Get BODY's text content in DOM.
484 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
485 ASSERT_TRUE(web_frame
!= NULL
);
486 WebDocument doc
= web_frame
->document();
487 ASSERT_TRUE(doc
.isHTMLDocument());
488 WebElement body_ele
= doc
.body();
489 ASSERT_TRUE(!body_ele
.isNull());
490 WebNode text_node
= body_ele
.firstChild();
491 ASSERT_TRUE(text_node
.isTextNode());
492 ASSERT_TRUE(std::string(text_node
.createMarkup().utf8()) ==
493 "&<>\"\'");
495 SerializeDomForURL(file_url
, false);
496 // Compare the serialized contents with original contents.
497 ASSERT_TRUE(HasSerializedFrame(file_url
));
498 const std::string
& serialized_contents
=
499 GetSerializedContentForFrame(file_url
);
500 // Compare the serialized contents with original contents to make sure
502 // Because we add MOTW when serializing DOM, so before comparison, we also
503 // need to add MOTW to original_contents.
504 std::string original_str
=
505 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url
).utf8();
506 original_str
+= original_contents
;
507 // Since WebCore now inserts a new HEAD element if there is no HEAD element
508 // when creating BODY element. (Please see
509 // HTMLParser::bodyCreateErrorCheck.) We need to append the HEAD content and
510 // corresponding META content if we find WebCore-generated HEAD element.
511 if (!doc
.head().isNull()) {
512 WebString encoding
= web_frame
->document().encoding();
513 std::string
htmlTag("<html>");
514 std::string::size_type pos
= original_str
.find(htmlTag
);
515 ASSERT_NE(std::string::npos
, pos
);
516 pos
+= htmlTag
.length();
517 std::string
head_part("<head>");
519 WebPageSerializer::generateMetaCharsetDeclaration(encoding
).utf8();
520 head_part
+= "</head>";
521 original_str
.insert(pos
, head_part
);
523 ASSERT_EQ(original_str
, serialized_contents
);
526 void SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer() {
527 base::FilePath page_file_path
= GetTestFilePath(
528 "dom_serializer", "dom_serializer/htmlentities_in_attribute_value.htm");
529 // Get file URL. The URL is dummy URL to identify the following loading
530 // actions. The test content is in constant:original_contents.
531 GURL file_url
= net::FilePathToFileURL(page_file_path
);
532 ASSERT_TRUE(file_url
.SchemeIsFile());
534 static const char* const original_contents
=
535 "<html><body title=\"&<>"'\"></body></html>";
536 // Load the test contents.
537 LoadContents(original_contents
, file_url
, WebString());
538 // Get value of BODY's title attribute in DOM.
539 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
540 ASSERT_TRUE(web_frame
!= NULL
);
541 WebDocument doc
= web_frame
->document();
542 ASSERT_TRUE(doc
.isHTMLDocument());
543 WebElement body_ele
= doc
.body();
544 ASSERT_TRUE(!body_ele
.isNull());
545 WebString value
= body_ele
.getAttribute("title");
546 ASSERT_TRUE(std::string(value
.utf8()) == "&<>\"\'");
548 SerializeDomForURL(file_url
, false);
549 // Compare the serialized contents with original contents.
550 ASSERT_TRUE(HasSerializedFrame(file_url
));
551 const std::string
& serialized_contents
=
552 GetSerializedContentForFrame(file_url
);
553 // Compare the serialized contents with original contents to make sure
555 std::string original_str
=
556 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url
).utf8();
557 original_str
+= original_contents
;
559 WebString encoding
= web_frame
->document().encoding();
560 std::string
htmlTag("<html>");
561 std::string::size_type pos
= original_str
.find(htmlTag
);
562 ASSERT_NE(std::string::npos
, pos
);
563 pos
+= htmlTag
.length();
564 std::string
head_part("<head>");
566 WebPageSerializer::generateMetaCharsetDeclaration(encoding
).utf8();
567 head_part
+= "</head>";
568 original_str
.insert(pos
, head_part
);
570 ASSERT_EQ(original_str
, serialized_contents
);
573 void SerializeHTMLDOMWithNonStandardEntitiesOnRenderer(const GURL
& file_url
) {
574 // Get value of BODY's title attribute in DOM.
575 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
576 WebDocument doc
= web_frame
->document();
577 ASSERT_TRUE(doc
.isHTMLDocument());
578 WebElement body_element
= doc
.body();
579 // Unescaped string for "%⊅¹'".
580 static const wchar_t parsed_value
[] = {
581 '%', 0x2285, 0x00b9, '\'', 0
583 WebString value
= body_element
.getAttribute("title");
584 WebString content
= doc
.contentAsTextForTesting();
585 ASSERT_TRUE(base::UTF16ToWide(value
) == parsed_value
);
586 ASSERT_TRUE(base::UTF16ToWide(content
) == parsed_value
);
589 SerializeDomForURL(file_url
, false);
590 // Check the serialized string.
591 ASSERT_TRUE(HasSerializedFrame(file_url
));
592 const std::string
& serialized_contents
=
593 GetSerializedContentForFrame(file_url
);
594 // Confirm that the serialized string has no non-standard HTML entities.
595 ASSERT_EQ(std::string::npos
, serialized_contents
.find("%"));
596 ASSERT_EQ(std::string::npos
, serialized_contents
.find("⊅"));
597 ASSERT_EQ(std::string::npos
, serialized_contents
.find("¹"));
598 ASSERT_EQ(std::string::npos
, serialized_contents
.find("'"));
601 void SerializeHTMLDOMWithBaseTagOnRenderer(const GURL
& file_url
,
602 const GURL
& path_dir_url
) {
603 // There are total 2 available base tags in this test file.
604 const int kTotalBaseTagCountInTestFile
= 2;
606 // Since for this test, we assume there is no savable sub-resource links for
607 // this test file, also all links are relative URLs in this test file, so we
608 // need to check those relative URLs and make sure document has BASE tag.
609 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
610 ASSERT_TRUE(web_frame
!= NULL
);
611 WebDocument doc
= web_frame
->document();
612 ASSERT_TRUE(doc
.isHTMLDocument());
613 // Go through all descent nodes.
614 WebElementCollection all
= doc
.all();
615 int original_base_tag_count
= 0;
616 for (WebElement element
= all
.firstItem(); !element
.isNull();
617 element
= all
.nextItem()) {
618 if (element
.hasHTMLTagName("base")) {
619 original_base_tag_count
++;
622 WebString value
= GetSubResourceLinkFromElement(element
);
623 if (value
.isNull() && element
.hasHTMLTagName("a")) {
624 value
= element
.getAttribute("href");
628 // Each link is relative link.
629 if (!value
.isNull()) {
630 GURL
link(value
.utf8());
631 ASSERT_TRUE(link
.scheme().empty());
635 ASSERT_EQ(original_base_tag_count
, kTotalBaseTagCountInTestFile
);
636 // Make sure in original document, the base URL is not equal with the
638 GURL
original_base_url(doc
.baseURL());
639 ASSERT_NE(original_base_url
, path_dir_url
);
642 SerializeDomForURL(file_url
, false);
644 // Load the serialized contents.
645 ASSERT_TRUE(HasSerializedFrame(file_url
));
646 const std::string
& serialized_contents
=
647 GetSerializedContentForFrame(file_url
);
648 LoadContents(serialized_contents
, file_url
,
649 web_frame
->document().encoding());
651 // Make sure all links are absolute URLs and doc there are some number of
652 // BASE tags in serialized HTML data. Each of those BASE tags have same base
653 // URL which is as same as URL of current test file.
654 web_frame
= GetMainFrame();
655 ASSERT_TRUE(web_frame
!= NULL
);
656 doc
= web_frame
->document();
657 ASSERT_TRUE(doc
.isHTMLDocument());
658 // Go through all descent nodes.
660 int new_base_tag_count
= 0;
661 for (WebNode node
= all
.firstItem(); !node
.isNull();
662 node
= all
.nextItem()) {
663 if (!node
.isElementNode())
665 WebElement element
= node
.to
<WebElement
>();
666 if (element
.hasHTMLTagName("base")) {
667 new_base_tag_count
++;
670 WebString value
= GetSubResourceLinkFromElement(element
);
671 if (value
.isNull() && element
.hasHTMLTagName("a")) {
672 value
= element
.getAttribute("href");
676 // Each link is absolute link.
677 if (!value
.isNull()) {
678 GURL
link(std::string(value
.utf8()));
679 ASSERT_FALSE(link
.scheme().empty());
683 // We have one more added BASE tag which is generated by JavaScript.
684 ASSERT_EQ(new_base_tag_count
, original_base_tag_count
+ 1);
685 // Make sure in new document, the base URL is equal with the |path_dir_url|.
686 GURL
new_base_url(doc
.baseURL());
687 ASSERT_EQ(new_base_url
, path_dir_url
);
690 void SerializeHTMLDOMWithEmptyHeadOnRenderer() {
691 base::FilePath page_file_path
= GetTestFilePath(
692 "dom_serializer", "empty_head.htm");
693 GURL file_url
= net::FilePathToFileURL(page_file_path
);
694 ASSERT_TRUE(file_url
.SchemeIsFile());
696 // Load the test html content.
697 static const char* const empty_head_contents
=
698 "<html><head></head><body>hello world</body></html>";
699 LoadContents(empty_head_contents
, file_url
, WebString());
701 // Make sure the head tag is empty.
702 WebFrame
* web_frame
= GetMainFrame();
703 ASSERT_TRUE(web_frame
!= NULL
);
704 WebDocument doc
= web_frame
->document();
705 ASSERT_TRUE(doc
.isHTMLDocument());
706 WebElement head_element
= doc
.head();
707 ASSERT_TRUE(!head_element
.isNull());
708 ASSERT_TRUE(!head_element
.hasChildNodes());
709 ASSERT_TRUE(head_element
.childNodes().length() == 0);
712 SerializeDomForURL(file_url
, false);
713 // Make sure the serialized contents have META ;
714 ASSERT_TRUE(HasSerializedFrame(file_url
));
715 const std::string
& serialized_contents
=
716 GetSerializedContentForFrame(file_url
);
718 // Reload serialized contents and make sure there is only one META tag.
719 LoadContents(serialized_contents
, file_url
,
720 web_frame
->document().encoding());
721 web_frame
= GetMainFrame();
722 ASSERT_TRUE(web_frame
!= NULL
);
723 doc
= web_frame
->document();
724 ASSERT_TRUE(doc
.isHTMLDocument());
725 head_element
= doc
.head();
726 ASSERT_TRUE(!head_element
.isNull());
727 ASSERT_TRUE(head_element
.hasChildNodes());
728 ASSERT_TRUE(head_element
.childNodes().length() == 1);
729 WebNode meta_node
= head_element
.firstChild();
730 ASSERT_TRUE(!meta_node
.isNull());
731 // Get meta charset info.
732 std::string charset_info
;
733 ASSERT_TRUE(IsMetaElement(meta_node
, charset_info
));
734 ASSERT_TRUE(!charset_info
.empty());
735 ASSERT_EQ(charset_info
,
736 std::string(web_frame
->document().encoding().utf8()));
738 // Check the body's first node is text node and its contents are
740 WebElement body_element
= doc
.body();
741 ASSERT_TRUE(!body_element
.isNull());
742 WebNode text_node
= body_element
.firstChild();
743 ASSERT_TRUE(text_node
.isTextNode());
744 WebString text_node_contents
= text_node
.nodeValue();
745 ASSERT_TRUE(std::string(text_node_contents
.utf8()) == "hello world");
748 void SerializeDocumentWithDownloadedIFrameOnRenderer(const GURL
& file_url
) {
749 // Do a recursive serialization. We pass if we don't crash.
750 SerializeDomForURL(file_url
, true);
753 void SubResourceForElementsInNonHTMLNamespaceOnRenderer(
754 const GURL
& file_url
) {
755 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
756 ASSERT_TRUE(web_frame
!= NULL
);
757 WebDocument doc
= web_frame
->document();
758 WebNode lastNodeInBody
= doc
.body().lastChild();
759 ASSERT_EQ(WebNode::ElementNode
, lastNodeInBody
.nodeType());
760 WebString uri
= GetSubResourceLinkFromElement(
761 lastNodeInBody
.to
<WebElement
>());
762 EXPECT_TRUE(uri
.isNull());
766 int32 render_view_routing_id_
;
767 // Map frame_url to corresponding serialized_content.
768 typedef base::hash_map
<std::string
, std::string
> SerializedFrameContentMap
;
769 SerializedFrameContentMap serialized_frame_map_
;
770 // Map frame_url to corresponding status of serialization finish.
771 typedef base::hash_map
<std::string
, bool> SerializationFinishStatusMap
;
772 SerializationFinishStatusMap serialization_finish_status_
;
773 // Flag indicates whether the process of serializing DOM is finished or not.
775 // The local_directory_name_ is dummy relative path of directory which
776 // contain all saved auxiliary files included all sub frames and resources.
777 const base::FilePath local_directory_name_
;
780 // If original contents have document type, the serialized contents also have
782 // Disabled by ellyjones@ on 2015-05-18, see https://crbug.com/488495.
783 #if defined(OS_MACOSX)
784 #define MAYBE_SerializeHTMLDOMWithDocType DISABLED_SerializeHTMLDOMWithDocType
786 #define MAYBE_SerializeHTMLDOMWithDocType SerializeHTMLDOMWithDocType
789 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
790 MAYBE_SerializeHTMLDOMWithDocType
) {
791 base::FilePath page_file_path
=
792 GetTestFilePath("dom_serializer", "youtube_1.htm");
793 GURL file_url
= net::FilePathToFileURL(page_file_path
);
794 ASSERT_TRUE(file_url
.SchemeIsFile());
795 // Load the test file.
796 NavigateToURL(shell(), file_url
);
798 PostTaskToInProcessRendererAndWait(
799 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithDocTypeOnRenderer
,
800 base::Unretained(this), file_url
));
803 // If original contents do not have document type, the serialized contents
804 // also do not have document type.
805 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeHTMLDOMWithoutDocType
) {
806 base::FilePath page_file_path
=
807 GetTestFilePath("dom_serializer", "youtube_2.htm");
808 GURL file_url
= net::FilePathToFileURL(page_file_path
);
809 ASSERT_TRUE(file_url
.SchemeIsFile());
810 // Load the test file.
811 NavigateToURL(shell(), file_url
);
813 PostTaskToInProcessRendererAndWait(
815 &DomSerializerTests::SerializeHTMLDOMWithoutDocTypeOnRenderer
,
816 base::Unretained(this), file_url
));
819 // Serialize XML document which has all 5 built-in entities. After
820 // finishing serialization, the serialized contents should be same
821 // with original XML document.
823 // TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
824 // XML headers are handled differently in the merged serializer.
825 // Bug: http://crbug.com/328354
826 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
827 DISABLED_SerializeXMLDocWithBuiltInEntities
) {
828 base::FilePath page_file_path
=
829 GetTestFilePath("dom_serializer", "note.html");
830 base::FilePath xml_file_path
= GetTestFilePath("dom_serializer", "note.xml");
831 // Read original contents for later comparison.
832 std::string original_contents
;
833 ASSERT_TRUE(base::ReadFileToString(xml_file_path
, &original_contents
));
835 GURL file_url
= net::FilePathToFileURL(page_file_path
);
836 GURL xml_file_url
= net::FilePathToFileURL(xml_file_path
);
837 ASSERT_TRUE(file_url
.SchemeIsFile());
838 // Load the test file.
839 NavigateToURL(shell(), file_url
);
841 PostTaskToInProcessRendererAndWait(
843 &DomSerializerTests::SerializeXMLDocWithBuiltInEntitiesOnRenderer
,
844 base::Unretained(this), xml_file_url
, original_contents
));
847 // When serializing DOM, we add MOTW declaration before html tag.
848 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeHTMLDOMWithAddingMOTW
) {
849 base::FilePath page_file_path
=
850 GetTestFilePath("dom_serializer", "youtube_2.htm");
851 // Read original contents for later comparison .
852 std::string original_contents
;
853 ASSERT_TRUE(base::ReadFileToString(page_file_path
, &original_contents
));
855 GURL file_url
= net::FilePathToFileURL(page_file_path
);
856 ASSERT_TRUE(file_url
.SchemeIsFile());
858 // Load the test file.
859 NavigateToURL(shell(), file_url
);
861 PostTaskToInProcessRendererAndWait(
863 &DomSerializerTests::SerializeHTMLDOMWithAddingMOTWOnRenderer
,
864 base::Unretained(this), file_url
, original_contents
));
867 // When serializing DOM, we will add the META which have correct charset
868 // declaration as first child of HEAD element for resolving WebKit bug:
869 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
870 // does not have META charset declaration.
871 // Disabled by battre@ on 2015-05-21, see https://crbug.com/488495.
872 #if defined(OS_MACOSX)
873 #define MAYBE_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc \
874 DISABLED_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc
876 #define MAYBE_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc \
877 SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc
879 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
880 MAYBE_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc
) {
881 base::FilePath page_file_path
=
882 GetTestFilePath("dom_serializer", "youtube_1.htm");
884 GURL file_url
= net::FilePathToFileURL(page_file_path
);
885 ASSERT_TRUE(file_url
.SchemeIsFile());
886 // Load the test file.
887 NavigateToURL(shell(), file_url
);
889 PostTaskToInProcessRendererAndWait(
891 &DomSerializerTests::
892 SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer
,
893 base::Unretained(this), file_url
));
896 // When serializing DOM, if the original document has multiple META charset
897 // declaration, we will add the META which have correct charset declaration
898 // as first child of HEAD element and remove all original META charset
900 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
901 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc
) {
902 base::FilePath page_file_path
=
903 GetTestFilePath("dom_serializer", "youtube_2.htm");
905 GURL file_url
= net::FilePathToFileURL(page_file_path
);
906 ASSERT_TRUE(file_url
.SchemeIsFile());
907 // Load the test file.
908 NavigateToURL(shell(), file_url
);
910 PostTaskToInProcessRendererAndWait(
912 &DomSerializerTests::
913 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer
,
914 base::Unretained(this), file_url
));
917 // Test situation of html entities in text when serializing HTML DOM.
918 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeHTMLDOMWithEntitiesInText
) {
919 // Need to spin up the renderer and also navigate to a file url so that the
920 // renderer code doesn't attempt a fork when it sees a load to file scheme
921 // from non-file scheme.
922 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
924 PostTaskToInProcessRendererAndWait(
926 &DomSerializerTests::SerializeHTMLDOMWithEntitiesInTextOnRenderer
,
927 base::Unretained(this)));
930 // Test situation of html entities in attribute value when serializing
932 // This test started to fail at WebKit r65388. See http://crbug.com/52279.
934 // TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
935 // Some attributes are handled differently in the merged serializer.
936 // Bug: http://crbug.com/328354
937 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
938 DISABLED_SerializeHTMLDOMWithEntitiesInAttributeValue
) {
939 // Need to spin up the renderer and also navigate to a file url so that the
940 // renderer code doesn't attempt a fork when it sees a load to file scheme
941 // from non-file scheme.
942 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
944 PostTaskToInProcessRendererAndWait(
946 &DomSerializerTests::
947 SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer
,
948 base::Unretained(this)));
951 // Test situation of non-standard HTML entities when serializing HTML DOM.
952 // This test started to fail at WebKit r65351. See http://crbug.com/52279.
953 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
954 SerializeHTMLDOMWithNonStandardEntities
) {
955 // Make a test file URL and load it.
956 base::FilePath page_file_path
= GetTestFilePath(
957 "dom_serializer", "nonstandard_htmlentities.htm");
958 GURL file_url
= net::FilePathToFileURL(page_file_path
);
959 NavigateToURL(shell(), file_url
);
961 PostTaskToInProcessRendererAndWait(
963 &DomSerializerTests::
964 SerializeHTMLDOMWithNonStandardEntitiesOnRenderer
,
965 base::Unretained(this), file_url
));
968 // Test situation of BASE tag in original document when serializing HTML DOM.
969 // When serializing, we should comment the BASE tag, append a new BASE tag.
970 // rewrite all the savable URLs to relative local path, and change other URLs
973 // TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
974 // Base tags are handled a bit different in merged version.
975 // Bug: http://crbug.com/328354
976 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
977 DISABLED_SerializeHTMLDOMWithBaseTag
) {
978 base::FilePath page_file_path
= GetTestFilePath(
979 "dom_serializer", "html_doc_has_base_tag.htm");
981 // Get page dir URL which is base URL of this file.
982 base::FilePath dir_name
= page_file_path
.DirName();
983 dir_name
= dir_name
.Append(
984 base::FilePath::StringType(base::FilePath::kSeparators
[0], 1));
985 GURL path_dir_url
= net::FilePathToFileURL(dir_name
);
988 GURL file_url
= net::FilePathToFileURL(page_file_path
);
989 ASSERT_TRUE(file_url
.SchemeIsFile());
990 // Load the test file.
991 NavigateToURL(shell(), file_url
);
993 PostTaskToInProcessRendererAndWait(
995 &DomSerializerTests::SerializeHTMLDOMWithBaseTagOnRenderer
,
996 base::Unretained(this), file_url
, path_dir_url
));
999 // Serializing page which has an empty HEAD tag.
1000 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeHTMLDOMWithEmptyHead
) {
1001 // Need to spin up the renderer and also navigate to a file url so that the
1002 // renderer code doesn't attempt a fork when it sees a load to file scheme
1003 // from non-file scheme.
1004 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
1006 PostTaskToInProcessRendererAndWait(
1007 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithEmptyHeadOnRenderer
,
1008 base::Unretained(this)));
1011 // Test that we don't crash when the page contains an iframe that
1012 // was handled as a download (http://crbug.com/42212).
1013 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
1014 SerializeDocumentWithDownloadedIFrame
) {
1015 base::FilePath page_file_path
= GetTestFilePath(
1016 "dom_serializer", "iframe-src-is-exe.htm");
1017 GURL file_url
= net::FilePathToFileURL(page_file_path
);
1018 ASSERT_TRUE(file_url
.SchemeIsFile());
1019 // Load the test file.
1020 NavigateToURL(shell(), file_url
);
1022 PostTaskToInProcessRendererAndWait(
1024 &DomSerializerTests::
1025 SerializeDocumentWithDownloadedIFrameOnRenderer
,
1026 base::Unretained(this), file_url
));
1029 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
1030 SubResourceForElementsInNonHTMLNamespace
) {
1031 base::FilePath page_file_path
= GetTestFilePath(
1032 "dom_serializer", "non_html_namespace.htm");
1033 GURL file_url
= net::FilePathToFileURL(page_file_path
);
1034 NavigateToURL(shell(), file_url
);
1036 PostTaskToInProcessRendererAndWait(
1038 &DomSerializerTests::
1039 SubResourceForElementsInNonHTMLNamespaceOnRenderer
,
1040 base::Unretained(this), file_url
));
1043 } // namespace content