1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
6 #include "base/command_line.h"
7 #include "base/compiler_specific.h"
8 #include "base/containers/hash_tables.h"
9 #include "base/files/file_path.h"
10 #include "base/files/file_util.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "content/public/common/content_switches.h"
14 #include "content/public/renderer/render_view.h"
15 #include "content/public/renderer/render_view_observer.h"
16 #include "content/public/test/content_browser_test.h"
17 #include "content/public/test/content_browser_test_utils.h"
18 #include "content/public/test/test_utils.h"
19 #include "content/renderer/savable_resources.h"
20 #include "content/shell/browser/shell.h"
21 #include "net/base/filename_util.h"
22 #include "net/url_request/url_request_context.h"
23 #include "third_party/WebKit/public/platform/WebCString.h"
24 #include "third_party/WebKit/public/platform/WebData.h"
25 #include "third_party/WebKit/public/platform/WebString.h"
26 #include "third_party/WebKit/public/platform/WebURL.h"
27 #include "third_party/WebKit/public/platform/WebVector.h"
28 #include "third_party/WebKit/public/web/WebDocument.h"
29 #include "third_party/WebKit/public/web/WebElement.h"
30 #include "third_party/WebKit/public/web/WebElementCollection.h"
31 #include "third_party/WebKit/public/web/WebLocalFrame.h"
32 #include "third_party/WebKit/public/web/WebNode.h"
33 #include "third_party/WebKit/public/web/WebNodeList.h"
34 #include "third_party/WebKit/public/web/WebPageSerializer.h"
35 #include "third_party/WebKit/public/web/WebPageSerializerClient.h"
36 #include "third_party/WebKit/public/web/WebView.h"
38 using blink::WebCString
;
40 using blink::WebDocument
;
41 using blink::WebElement
;
42 using blink::WebElementCollection
;
43 using blink::WebFrame
;
44 using blink::WebLocalFrame
;
46 using blink::WebNodeList
;
47 using blink::WebPageSerializer
;
48 using blink::WebPageSerializerClient
;
49 using blink::WebString
;
52 using blink::WebVector
;
// Routing ID of the RenderView the tests talk to. The first RenderFrame is
// routing ID 1, and the first RenderView is 2.
const int kRenderViewRoutingId = 2;
63 // Iterate recursively over sub-frames to find one with with a given url.
64 WebFrame
* FindSubFrameByURL(WebView
* web_view
, const GURL
& url
) {
65 if (!web_view
->mainFrame())
68 std::vector
<WebFrame
*> stack
;
69 stack
.push_back(web_view
->mainFrame());
71 while (!stack
.empty()) {
72 WebFrame
* current_frame
= stack
.back();
74 if (GURL(current_frame
->document().url()) == url
)
76 WebElementCollection all
= current_frame
->document().all();
77 for (WebElement element
= all
.firstItem();
78 !element
.isNull(); element
= all
.nextItem()) {
79 // Check frame tag and iframe tag
80 if (!element
.hasHTMLTagName("frame") && !element
.hasHTMLTagName("iframe"))
82 WebFrame
* sub_frame
= WebLocalFrame::fromFrameOwnerElement(element
);
84 stack
.push_back(sub_frame
);
90 // Helper function that test whether the first node in the doc is a doc type
92 bool HasDocType(const WebDocument
& doc
) {
93 WebNode node
= doc
.firstChild();
96 return node
.nodeType() == WebNode::DocumentTypeNode
;
99 // Helper function for checking whether input node is META tag. Return true
100 // means it is META element, otherwise return false. The parameter charset_info
101 // return actual charset info if the META tag has charset declaration.
102 bool IsMetaElement(const WebNode
& node
, std::string
& charset_info
) {
103 if (!node
.isElementNode())
105 const WebElement meta
= node
.toConst
<WebElement
>();
106 if (!meta
.hasHTMLTagName("meta"))
108 charset_info
.erase(0, charset_info
.length());
109 // Check the META charset declaration.
110 WebString httpEquiv
= meta
.getAttribute("http-equiv");
111 if (base::LowerCaseEqualsASCII(httpEquiv
, "content-type")) {
112 std::string content
= meta
.getAttribute("content").utf8();
113 int pos
= content
.find("charset", 0);
115 // Add a dummy charset declaration to charset_info, which indicates this
116 // META tag has charset declaration although we do not get correct value
118 charset_info
.append("has-charset-declaration");
119 int remaining_length
= content
.length() - pos
- 7;
120 if (!remaining_length
)
122 int start_pos
= pos
+ 7;
124 while (remaining_length
--)
125 if (content
[start_pos
++] == L
'=')
127 // Skip beginning space.
128 while (remaining_length
) {
129 if (content
[start_pos
] > 0x0020)
134 if (!remaining_length
)
136 int end_pos
= start_pos
;
137 // Now we find out the start point of charset info. Search the end point.
138 while (remaining_length
--) {
139 if (content
[end_pos
] <= 0x0020 || content
[end_pos
] == L
';')
143 // Get actual charset info.
144 charset_info
= content
.substr(start_pos
, end_pos
- start_pos
);
151 class LoadObserver
: public RenderViewObserver
{
153 LoadObserver(RenderView
* render_view
, const base::Closure
& quit_closure
)
154 : RenderViewObserver(render_view
),
155 quit_closure_(quit_closure
) {}
157 void DidFinishLoad(blink::WebLocalFrame
* frame
) override
{
158 if (frame
== render_view()->GetWebView()->mainFrame())
163 base::Closure quit_closure_
;
166 class DomSerializerTests
: public ContentBrowserTest
,
167 public WebPageSerializerClient
{
170 : serialized_(false),
171 local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) {}
173 void SetUpCommandLine(base::CommandLine
* command_line
) override
{
174 command_line
->AppendSwitch(switches::kSingleProcess
);
176 // Don't want to try to create a GPU process.
177 command_line
->AppendSwitch(switches::kDisableGpu
);
181 // DomSerializerDelegate.
182 virtual void didSerializeDataForFrame(const WebURL
& frame_web_url
,
183 const WebCString
& data
,
184 PageSerializationStatus status
) {
186 GURL
frame_url(frame_web_url
);
187 // If the all frames are finished saving, check all finish status
188 if (status
== WebPageSerializerClient::AllFramesAreFinished
) {
189 SerializationFinishStatusMap::iterator it
=
190 serialization_finish_status_
.begin();
191 for (; it
!= serialization_finish_status_
.end(); ++it
)
192 ASSERT_TRUE(it
->second
);
197 // Check finish status of current frame.
198 SerializationFinishStatusMap::iterator it
=
199 serialization_finish_status_
.find(frame_url
.spec());
200 // New frame, set initial status as false.
201 if (it
== serialization_finish_status_
.end())
202 serialization_finish_status_
[frame_url
.spec()] = false;
204 it
= serialization_finish_status_
.find(frame_url
.spec());
205 ASSERT_TRUE(it
!= serialization_finish_status_
.end());
206 // In process frame, finish status should be false.
207 ASSERT_FALSE(it
->second
);
209 // Add data to corresponding frame's content.
210 serialized_frame_map_
[frame_url
.spec()] += data
.data();
212 // Current frame is completed saving, change the finish status.
213 if (status
== WebPageSerializerClient::CurrentFrameIsFinished
)
217 bool HasSerializedFrame(const GURL
& frame_url
) {
218 return serialized_frame_map_
.find(frame_url
.spec()) !=
219 serialized_frame_map_
.end();
222 const std::string
& GetSerializedContentForFrame(
223 const GURL
& frame_url
) {
224 return serialized_frame_map_
[frame_url
.spec()];
227 RenderView
* GetRenderView() {
228 // We could have the test on the UI thread get the WebContent's routing ID,
229 // but we know this will be the first RV so skip that and just hardcode it.
230 return RenderView::FromRoutingID(kRenderViewRoutingId
);
233 WebView
* GetWebView() {
234 return GetRenderView()->GetWebView();
237 WebFrame
* GetMainFrame() {
238 return GetWebView()->mainFrame();
241 // Load web page according to input content and relative URLs within
243 void LoadContents(const std::string
& contents
,
244 const GURL
& base_url
,
245 const WebString encoding_info
) {
246 scoped_refptr
<MessageLoopRunner
> runner
= new MessageLoopRunner
;
247 LoadObserver
observer(GetRenderView(), runner
->QuitClosure());
249 // If input encoding is empty, use UTF-8 as default encoding.
250 if (encoding_info
.isEmpty()) {
251 GetMainFrame()->loadHTMLString(contents
, base_url
);
253 WebData
data(contents
.data(), contents
.length());
255 // Do not use WebFrame.LoadHTMLString because it assumes that input
256 // html contents use UTF-8 encoding.
257 // TODO(darin): This should use WebFrame::loadData.
258 WebFrame
* web_frame
= GetMainFrame();
260 ASSERT_TRUE(web_frame
!= NULL
);
262 web_frame
->loadData(data
, "text/html", encoding_info
, base_url
);
268 // Serialize page DOM according to specific page URL. The parameter
269 // recursive_serialization indicates whether we will serialize all
271 void SerializeDomForURL(const GURL
& page_url
,
272 bool recursive_serialization
) {
273 // Find corresponding WebFrame according to page_url.
274 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), page_url
);
275 ASSERT_TRUE(web_frame
!= NULL
);
276 WebVector
<WebURL
> links
;
277 links
.assign(&page_url
, 1);
278 WebString file_path
=
279 base::FilePath(FILE_PATH_LITERAL("c:\\dummy.htm")).AsUTF16Unsafe();
280 WebVector
<WebString
> local_paths
;
281 local_paths
.assign(&file_path
, 1);
282 // Start serializing DOM.
283 bool result
= WebPageSerializer::serialize(web_frame
->toWebLocalFrame(),
284 recursive_serialization
,
285 static_cast<WebPageSerializerClient
*>(this),
288 local_directory_name_
.AsUTF16Unsafe());
290 ASSERT_TRUE(serialized_
);
293 void SerializeHTMLDOMWithDocTypeOnRenderer(const GURL
& file_url
) {
294 // Make sure original contents have document type.
295 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
296 ASSERT_TRUE(web_frame
!= NULL
);
297 WebDocument doc
= web_frame
->document();
298 ASSERT_TRUE(HasDocType(doc
));
300 SerializeDomForURL(file_url
, false);
301 // Load the serialized contents.
302 ASSERT_TRUE(HasSerializedFrame(file_url
));
303 const std::string
& serialized_contents
=
304 GetSerializedContentForFrame(file_url
);
305 LoadContents(serialized_contents
, file_url
,
306 web_frame
->document().encoding());
307 // Make sure serialized contents still have document type.
308 web_frame
= GetMainFrame();
309 doc
= web_frame
->document();
310 ASSERT_TRUE(HasDocType(doc
));
313 void SerializeHTMLDOMWithoutDocTypeOnRenderer(const GURL
& file_url
) {
314 // Make sure original contents do not have document type.
315 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
316 ASSERT_TRUE(web_frame
!= NULL
);
317 WebDocument doc
= web_frame
->document();
318 ASSERT_TRUE(!HasDocType(doc
));
320 SerializeDomForURL(file_url
, false);
321 // Load the serialized contents.
322 ASSERT_TRUE(HasSerializedFrame(file_url
));
323 const std::string
& serialized_contents
=
324 GetSerializedContentForFrame(file_url
);
325 LoadContents(serialized_contents
, file_url
,
326 web_frame
->document().encoding());
327 // Make sure serialized contents do not have document type.
328 web_frame
= GetMainFrame();
329 doc
= web_frame
->document();
330 ASSERT_TRUE(!HasDocType(doc
));
333 void SerializeXMLDocWithBuiltInEntitiesOnRenderer(
334 const GURL
& xml_file_url
, const std::string
& original_contents
) {
336 SerializeDomForURL(xml_file_url
, false);
337 // Compare the serialized contents with original contents.
338 ASSERT_TRUE(HasSerializedFrame(xml_file_url
));
339 const std::string
& serialized_contents
=
340 GetSerializedContentForFrame(xml_file_url
);
341 ASSERT_EQ(original_contents
, serialized_contents
);
344 void SerializeHTMLDOMWithAddingMOTWOnRenderer(
345 const GURL
& file_url
, const std::string
& original_contents
) {
346 // Make sure original contents does not have MOTW;
347 std::string motw_declaration
=
348 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url
).utf8();
349 ASSERT_FALSE(motw_declaration
.empty());
350 // The encoding of original contents is ISO-8859-1, so we convert the MOTW
351 // declaration to ASCII and search whether original contents has it or not.
352 ASSERT_TRUE(std::string::npos
== original_contents
.find(motw_declaration
));
355 SerializeDomForURL(file_url
, false);
356 // Make sure the serialized contents have MOTW ;
357 ASSERT_TRUE(HasSerializedFrame(file_url
));
358 const std::string
& serialized_contents
=
359 GetSerializedContentForFrame(file_url
);
360 ASSERT_FALSE(std::string::npos
==
361 serialized_contents
.find(motw_declaration
));
364 void SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer(
365 const GURL
& file_url
) {
366 // Make sure there is no META charset declaration in original document.
367 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
368 ASSERT_TRUE(web_frame
!= NULL
);
369 WebDocument doc
= web_frame
->document();
370 ASSERT_TRUE(doc
.isHTMLDocument());
371 WebElement head_element
= doc
.head();
372 ASSERT_TRUE(!head_element
.isNull());
373 // Go through all children of HEAD element.
374 for (WebNode child
= head_element
.firstChild(); !child
.isNull();
375 child
= child
.nextSibling()) {
376 std::string charset_info
;
377 if (IsMetaElement(child
, charset_info
))
378 ASSERT_TRUE(charset_info
.empty());
381 SerializeDomForURL(file_url
, false);
383 // Load the serialized contents.
384 ASSERT_TRUE(HasSerializedFrame(file_url
));
385 const std::string
& serialized_contents
=
386 GetSerializedContentForFrame(file_url
);
387 LoadContents(serialized_contents
, file_url
,
388 web_frame
->document().encoding());
389 // Make sure the first child of HEAD element is META which has charset
390 // declaration in serialized contents.
391 web_frame
= GetMainFrame();
392 ASSERT_TRUE(web_frame
!= NULL
);
393 doc
= web_frame
->document();
394 ASSERT_TRUE(doc
.isHTMLDocument());
395 head_element
= doc
.head();
396 ASSERT_TRUE(!head_element
.isNull());
397 WebNode meta_node
= head_element
.firstChild();
398 ASSERT_TRUE(!meta_node
.isNull());
399 // Get meta charset info.
400 std::string charset_info2
;
401 ASSERT_TRUE(IsMetaElement(meta_node
, charset_info2
));
402 ASSERT_TRUE(!charset_info2
.empty());
403 ASSERT_EQ(charset_info2
,
404 std::string(web_frame
->document().encoding().utf8()));
406 // Make sure no more additional META tags which have charset declaration.
407 for (WebNode child
= meta_node
.nextSibling(); !child
.isNull();
408 child
= child
.nextSibling()) {
409 std::string charset_info
;
410 if (IsMetaElement(child
, charset_info
))
411 ASSERT_TRUE(charset_info
.empty());
415 void SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer(
416 const GURL
& file_url
) {
417 // Make sure there are multiple META charset declarations in original
419 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
420 ASSERT_TRUE(web_frame
!= NULL
);
421 WebDocument doc
= web_frame
->document();
422 ASSERT_TRUE(doc
.isHTMLDocument());
423 WebElement head_ele
= doc
.head();
424 ASSERT_TRUE(!head_ele
.isNull());
425 // Go through all children of HEAD element.
426 int charset_declaration_count
= 0;
427 for (WebNode child
= head_ele
.firstChild(); !child
.isNull();
428 child
= child
.nextSibling()) {
429 std::string charset_info
;
430 if (IsMetaElement(child
, charset_info
) && !charset_info
.empty())
431 charset_declaration_count
++;
433 // The original doc has more than META tags which have charset declaration.
434 ASSERT_TRUE(charset_declaration_count
> 1);
437 SerializeDomForURL(file_url
, false);
439 // Load the serialized contents.
440 ASSERT_TRUE(HasSerializedFrame(file_url
));
441 const std::string
& serialized_contents
=
442 GetSerializedContentForFrame(file_url
);
443 LoadContents(serialized_contents
, file_url
,
444 web_frame
->document().encoding());
445 // Make sure only first child of HEAD element is META which has charset
446 // declaration in serialized contents.
447 web_frame
= GetMainFrame();
448 ASSERT_TRUE(web_frame
!= NULL
);
449 doc
= web_frame
->document();
450 ASSERT_TRUE(doc
.isHTMLDocument());
451 head_ele
= doc
.head();
452 ASSERT_TRUE(!head_ele
.isNull());
453 WebNode meta_node
= head_ele
.firstChild();
454 ASSERT_TRUE(!meta_node
.isNull());
455 // Get meta charset info.
456 std::string charset_info2
;
457 ASSERT_TRUE(IsMetaElement(meta_node
, charset_info2
));
458 ASSERT_TRUE(!charset_info2
.empty());
459 ASSERT_EQ(charset_info2
,
460 std::string(web_frame
->document().encoding().utf8()));
462 // Make sure no more additional META tags which have charset declaration.
463 for (WebNode child
= meta_node
.nextSibling(); !child
.isNull();
464 child
= child
.nextSibling()) {
465 std::string charset_info
;
466 if (IsMetaElement(child
, charset_info
))
467 ASSERT_TRUE(charset_info
.empty());
471 void SerializeHTMLDOMWithEntitiesInTextOnRenderer() {
472 base::FilePath page_file_path
= GetTestFilePath(
473 "dom_serializer", "dom_serializer/htmlentities_in_text.htm");
474 // Get file URL. The URL is dummy URL to identify the following loading
475 // actions. The test content is in constant:original_contents.
476 GURL file_url
= net::FilePathToFileURL(page_file_path
);
477 ASSERT_TRUE(file_url
.SchemeIsFile());
479 static const char* const original_contents
=
480 "<html><body>&<>\"\'</body></html>";
481 // Load the test contents.
482 LoadContents(original_contents
, file_url
, WebString());
484 // Get BODY's text content in DOM.
485 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
486 ASSERT_TRUE(web_frame
!= NULL
);
487 WebDocument doc
= web_frame
->document();
488 ASSERT_TRUE(doc
.isHTMLDocument());
489 WebElement body_ele
= doc
.body();
490 ASSERT_TRUE(!body_ele
.isNull());
491 WebNode text_node
= body_ele
.firstChild();
492 ASSERT_TRUE(text_node
.isTextNode());
493 ASSERT_TRUE(std::string(text_node
.createMarkup().utf8()) ==
494 "&<>\"\'");
496 SerializeDomForURL(file_url
, false);
497 // Compare the serialized contents with original contents.
498 ASSERT_TRUE(HasSerializedFrame(file_url
));
499 const std::string
& serialized_contents
=
500 GetSerializedContentForFrame(file_url
);
501 // Compare the serialized contents with original contents to make sure
503 // Because we add MOTW when serializing DOM, so before comparison, we also
504 // need to add MOTW to original_contents.
505 std::string original_str
=
506 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url
).utf8();
507 original_str
+= original_contents
;
508 // Since WebCore now inserts a new HEAD element if there is no HEAD element
509 // when creating BODY element. (Please see
510 // HTMLParser::bodyCreateErrorCheck.) We need to append the HEAD content and
511 // corresponding META content if we find WebCore-generated HEAD element.
512 if (!doc
.head().isNull()) {
513 WebString encoding
= web_frame
->document().encoding();
514 std::string
htmlTag("<html>");
515 std::string::size_type pos
= original_str
.find(htmlTag
);
516 ASSERT_NE(std::string::npos
, pos
);
517 pos
+= htmlTag
.length();
518 std::string
head_part("<head>");
520 WebPageSerializer::generateMetaCharsetDeclaration(encoding
).utf8();
521 head_part
+= "</head>";
522 original_str
.insert(pos
, head_part
);
524 ASSERT_EQ(original_str
, serialized_contents
);
527 void SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer() {
528 base::FilePath page_file_path
= GetTestFilePath(
529 "dom_serializer", "dom_serializer/htmlentities_in_attribute_value.htm");
530 // Get file URL. The URL is dummy URL to identify the following loading
531 // actions. The test content is in constant:original_contents.
532 GURL file_url
= net::FilePathToFileURL(page_file_path
);
533 ASSERT_TRUE(file_url
.SchemeIsFile());
535 static const char* const original_contents
=
536 "<html><body title=\"&<>"'\"></body></html>";
537 // Load the test contents.
538 LoadContents(original_contents
, file_url
, WebString());
539 // Get value of BODY's title attribute in DOM.
540 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
541 ASSERT_TRUE(web_frame
!= NULL
);
542 WebDocument doc
= web_frame
->document();
543 ASSERT_TRUE(doc
.isHTMLDocument());
544 WebElement body_ele
= doc
.body();
545 ASSERT_TRUE(!body_ele
.isNull());
546 WebString value
= body_ele
.getAttribute("title");
547 ASSERT_TRUE(std::string(value
.utf8()) == "&<>\"\'");
549 SerializeDomForURL(file_url
, false);
550 // Compare the serialized contents with original contents.
551 ASSERT_TRUE(HasSerializedFrame(file_url
));
552 const std::string
& serialized_contents
=
553 GetSerializedContentForFrame(file_url
);
554 // Compare the serialized contents with original contents to make sure
556 std::string original_str
=
557 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url
).utf8();
558 original_str
+= original_contents
;
560 WebString encoding
= web_frame
->document().encoding();
561 std::string
htmlTag("<html>");
562 std::string::size_type pos
= original_str
.find(htmlTag
);
563 ASSERT_NE(std::string::npos
, pos
);
564 pos
+= htmlTag
.length();
565 std::string
head_part("<head>");
567 WebPageSerializer::generateMetaCharsetDeclaration(encoding
).utf8();
568 head_part
+= "</head>";
569 original_str
.insert(pos
, head_part
);
571 ASSERT_EQ(original_str
, serialized_contents
);
574 void SerializeHTMLDOMWithNonStandardEntitiesOnRenderer(const GURL
& file_url
) {
575 // Get value of BODY's title attribute in DOM.
576 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
577 WebDocument doc
= web_frame
->document();
578 ASSERT_TRUE(doc
.isHTMLDocument());
579 WebElement body_element
= doc
.body();
580 // Unescaped string for "%⊅¹'".
581 static const wchar_t parsed_value
[] = {
582 '%', 0x2285, 0x00b9, '\'', 0
584 WebString value
= body_element
.getAttribute("title");
585 WebString content
= doc
.contentAsTextForTesting();
586 ASSERT_TRUE(base::UTF16ToWide(value
) == parsed_value
);
587 ASSERT_TRUE(base::UTF16ToWide(content
) == parsed_value
);
590 SerializeDomForURL(file_url
, false);
591 // Check the serialized string.
592 ASSERT_TRUE(HasSerializedFrame(file_url
));
593 const std::string
& serialized_contents
=
594 GetSerializedContentForFrame(file_url
);
595 // Confirm that the serialized string has no non-standard HTML entities.
596 ASSERT_EQ(std::string::npos
, serialized_contents
.find("%"));
597 ASSERT_EQ(std::string::npos
, serialized_contents
.find("⊅"));
598 ASSERT_EQ(std::string::npos
, serialized_contents
.find("¹"));
599 ASSERT_EQ(std::string::npos
, serialized_contents
.find("'"));
602 void SerializeHTMLDOMWithBaseTagOnRenderer(const GURL
& file_url
,
603 const GURL
& path_dir_url
) {
604 // There are 2 available base tags in total in this test file.
605 const int kTotalBaseTagCountInTestFile
= 2;
607 // Since for this test, we assume there is no savable sub-resource links for
608 // this test file, also all links are relative URLs in this test file, so we
609 // need to check those relative URLs and make sure document has BASE tag.
610 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
611 ASSERT_TRUE(web_frame
!= NULL
);
612 WebDocument doc
= web_frame
->document();
613 ASSERT_TRUE(doc
.isHTMLDocument());
614 // Go through all descendant nodes.
615 WebElementCollection all
= doc
.all();
616 int original_base_tag_count
= 0;
617 for (WebElement element
= all
.firstItem(); !element
.isNull();
618 element
= all
.nextItem()) {
619 if (element
.hasHTMLTagName("base")) {
620 original_base_tag_count
++;
623 WebString value
= GetSubResourceLinkFromElement(element
);
624 if (value
.isNull() && element
.hasHTMLTagName("a")) {
625 value
= element
.getAttribute("href");
629 // Each link is relative link.
630 if (!value
.isNull()) {
631 GURL
link(value
.utf8());
632 ASSERT_TRUE(link
.scheme().empty());
636 ASSERT_EQ(original_base_tag_count
, kTotalBaseTagCountInTestFile
);
637 // Make sure in original document, the base URL is not equal with the
639 GURL
original_base_url(doc
.baseURL());
640 ASSERT_NE(original_base_url
, path_dir_url
);
643 SerializeDomForURL(file_url
, false);
645 // Load the serialized contents.
646 ASSERT_TRUE(HasSerializedFrame(file_url
));
647 const std::string
& serialized_contents
=
648 GetSerializedContentForFrame(file_url
);
649 LoadContents(serialized_contents
, file_url
,
650 web_frame
->document().encoding());
652 // Make sure all links are absolute URLs and that there are some BASE tags
653 // in the serialized HTML data. Each of those BASE tags has the same base
654 // URL, which is the same as the URL of the current test file.
655 web_frame
= GetMainFrame();
656 ASSERT_TRUE(web_frame
!= NULL
);
657 doc
= web_frame
->document();
658 ASSERT_TRUE(doc
.isHTMLDocument());
659 // Go through all descendant nodes.
661 int new_base_tag_count
= 0;
662 for (WebNode node
= all
.firstItem(); !node
.isNull();
663 node
= all
.nextItem()) {
664 if (!node
.isElementNode())
666 WebElement element
= node
.to
<WebElement
>();
667 if (element
.hasHTMLTagName("base")) {
668 new_base_tag_count
++;
671 WebString value
= GetSubResourceLinkFromElement(element
);
672 if (value
.isNull() && element
.hasHTMLTagName("a")) {
673 value
= element
.getAttribute("href");
677 // Each link is absolute link.
678 if (!value
.isNull()) {
679 GURL
link(std::string(value
.utf8()));
680 ASSERT_FALSE(link
.scheme().empty());
684 // We should have the same number of base tags.
685 ASSERT_EQ(new_base_tag_count
, original_base_tag_count
);
686 // Make sure in new document, the base URL is equal with the |path_dir_url|.
687 GURL
new_base_url(doc
.baseURL());
688 ASSERT_EQ(new_base_url
, path_dir_url
);
691 void SerializeHTMLDOMWithEmptyHeadOnRenderer() {
692 base::FilePath page_file_path
= GetTestFilePath(
693 "dom_serializer", "empty_head.htm");
694 GURL file_url
= net::FilePathToFileURL(page_file_path
);
695 ASSERT_TRUE(file_url
.SchemeIsFile());
697 // Load the test html content.
698 static const char* const empty_head_contents
=
699 "<html><head></head><body>hello world</body></html>";
700 LoadContents(empty_head_contents
, file_url
, WebString());
702 // Make sure the head tag is empty.
703 WebFrame
* web_frame
= GetMainFrame();
704 ASSERT_TRUE(web_frame
!= NULL
);
705 WebDocument doc
= web_frame
->document();
706 ASSERT_TRUE(doc
.isHTMLDocument());
707 WebElement head_element
= doc
.head();
708 ASSERT_TRUE(!head_element
.isNull());
709 ASSERT_TRUE(!head_element
.hasChildNodes());
710 ASSERT_TRUE(head_element
.childNodes().length() == 0);
713 SerializeDomForURL(file_url
, false);
714 // Make sure the serialized contents have META ;
715 ASSERT_TRUE(HasSerializedFrame(file_url
));
716 const std::string
& serialized_contents
=
717 GetSerializedContentForFrame(file_url
);
719 // Reload serialized contents and make sure there is only one META tag.
720 LoadContents(serialized_contents
, file_url
,
721 web_frame
->document().encoding());
722 web_frame
= GetMainFrame();
723 ASSERT_TRUE(web_frame
!= NULL
);
724 doc
= web_frame
->document();
725 ASSERT_TRUE(doc
.isHTMLDocument());
726 head_element
= doc
.head();
727 ASSERT_TRUE(!head_element
.isNull());
728 ASSERT_TRUE(head_element
.hasChildNodes());
729 ASSERT_TRUE(head_element
.childNodes().length() == 1);
730 WebNode meta_node
= head_element
.firstChild();
731 ASSERT_TRUE(!meta_node
.isNull());
732 // Get meta charset info.
733 std::string charset_info
;
734 ASSERT_TRUE(IsMetaElement(meta_node
, charset_info
));
735 ASSERT_TRUE(!charset_info
.empty());
736 ASSERT_EQ(charset_info
,
737 std::string(web_frame
->document().encoding().utf8()));
739 // Check the body's first node is text node and its contents are
741 WebElement body_element
= doc
.body();
742 ASSERT_TRUE(!body_element
.isNull());
743 WebNode text_node
= body_element
.firstChild();
744 ASSERT_TRUE(text_node
.isTextNode());
745 WebString text_node_contents
= text_node
.nodeValue();
746 ASSERT_TRUE(std::string(text_node_contents
.utf8()) == "hello world");
749 void SerializeDocumentWithDownloadedIFrameOnRenderer(const GURL
& file_url
) {
750 // Do a recursive serialization. We pass if we don't crash.
751 SerializeDomForURL(file_url
, true);
754 void SubResourceForElementsInNonHTMLNamespaceOnRenderer(
755 const GURL
& file_url
) {
756 WebFrame
* web_frame
= FindSubFrameByURL(GetWebView(), file_url
);
757 ASSERT_TRUE(web_frame
!= NULL
);
758 WebDocument doc
= web_frame
->document();
759 WebNode lastNodeInBody
= doc
.body().lastChild();
760 ASSERT_EQ(WebNode::ElementNode
, lastNodeInBody
.nodeType());
761 WebString uri
= GetSubResourceLinkFromElement(
762 lastNodeInBody
.to
<WebElement
>());
763 EXPECT_TRUE(uri
.isNull());
767 // Map frame_url to corresponding serialized_content.
768 typedef base::hash_map
<std::string
, std::string
> SerializedFrameContentMap
;
769 SerializedFrameContentMap serialized_frame_map_
;
770 // Map frame_url to corresponding status of serialization finish.
771 typedef base::hash_map
<std::string
, bool> SerializationFinishStatusMap
;
772 SerializationFinishStatusMap serialization_finish_status_
;
773 // Flag indicates whether the process of serializing DOM is finished or not.
775 // The local_directory_name_ is dummy relative path of directory which
776 // contain all saved auxiliary files included all sub frames and resources.
777 const base::FilePath local_directory_name_
;
780 // If original contents have document type, the serialized contents also have
782 // Disabled by ellyjones@ on 2015-05-18, see https://crbug.com/488495.
783 #if defined(OS_MACOSX)
784 #define MAYBE_SerializeHTMLDOMWithDocType DISABLED_SerializeHTMLDOMWithDocType
786 #define MAYBE_SerializeHTMLDOMWithDocType SerializeHTMLDOMWithDocType
789 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
790 MAYBE_SerializeHTMLDOMWithDocType
) {
791 base::FilePath page_file_path
=
792 GetTestFilePath("dom_serializer", "youtube_1.htm");
793 GURL file_url
= net::FilePathToFileURL(page_file_path
);
794 ASSERT_TRUE(file_url
.SchemeIsFile());
795 // Load the test file.
796 NavigateToURL(shell(), file_url
);
798 PostTaskToInProcessRendererAndWait(
799 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithDocTypeOnRenderer
,
800 base::Unretained(this), file_url
));
803 // If original contents do not have document type, the serialized contents
804 // also do not have document type.
805 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeHTMLDOMWithoutDocType
) {
806 base::FilePath page_file_path
=
807 GetTestFilePath("dom_serializer", "youtube_2.htm");
808 GURL file_url
= net::FilePathToFileURL(page_file_path
);
809 ASSERT_TRUE(file_url
.SchemeIsFile());
810 // Load the test file.
811 NavigateToURL(shell(), file_url
);
813 PostTaskToInProcessRendererAndWait(
815 &DomSerializerTests::SerializeHTMLDOMWithoutDocTypeOnRenderer
,
816 base::Unretained(this), file_url
));
819 // Serialize XML document which has all 5 built-in entities. After
820 // finishing serialization, the serialized contents should be same
821 // with original XML document.
822 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeXMLDocWithBuiltInEntities
) {
823 base::FilePath page_file_path
=
824 GetTestFilePath("dom_serializer", "note.html");
825 base::FilePath xml_file_path
= GetTestFilePath("dom_serializer", "note.xml");
826 // Read original contents for later comparison.
827 std::string original_contents
;
828 ASSERT_TRUE(base::ReadFileToString(xml_file_path
, &original_contents
));
830 GURL file_url
= net::FilePathToFileURL(page_file_path
);
831 GURL xml_file_url
= net::FilePathToFileURL(xml_file_path
);
832 ASSERT_TRUE(file_url
.SchemeIsFile());
833 // Load the test file.
834 NavigateToURL(shell(), file_url
);
836 PostTaskToInProcessRendererAndWait(
838 &DomSerializerTests::SerializeXMLDocWithBuiltInEntitiesOnRenderer
,
839 base::Unretained(this), xml_file_url
, original_contents
));
842 // When serializing DOM, we add MOTW declaration before html tag.
843 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeHTMLDOMWithAddingMOTW
) {
844 base::FilePath page_file_path
=
845 GetTestFilePath("dom_serializer", "youtube_2.htm");
846 // Read original contents for later comparison .
847 std::string original_contents
;
848 ASSERT_TRUE(base::ReadFileToString(page_file_path
, &original_contents
));
850 GURL file_url
= net::FilePathToFileURL(page_file_path
);
851 ASSERT_TRUE(file_url
.SchemeIsFile());
853 // Load the test file.
854 NavigateToURL(shell(), file_url
);
856 PostTaskToInProcessRendererAndWait(
858 &DomSerializerTests::SerializeHTMLDOMWithAddingMOTWOnRenderer
,
859 base::Unretained(this), file_url
, original_contents
));
// When serializing DOM, we will add the META which have correct charset
// declaration as first child of HEAD element for resolving WebKit bug:
// http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
// does not have META charset declaration.
// Disabled by battre@ on 2015-05-21, see https://crbug.com/488495.
// The test name is routed through a MAYBE_ macro so that it gets the
// DISABLED_ prefix only when OS_MACOSX is defined.
#if defined(OS_MACOSX)
#define MAYBE_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc \
  DISABLED_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc
#else
#define MAYBE_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc \
  SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc
#endif
874 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
875 MAYBE_SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc
) {
876 base::FilePath page_file_path
=
877 GetTestFilePath("dom_serializer", "youtube_1.htm");
879 GURL file_url
= net::FilePathToFileURL(page_file_path
);
880 ASSERT_TRUE(file_url
.SchemeIsFile());
881 // Load the test file.
882 NavigateToURL(shell(), file_url
);
884 PostTaskToInProcessRendererAndWait(
886 &DomSerializerTests::
887 SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer
,
888 base::Unretained(this), file_url
));
891 // When serializing DOM, if the original document has multiple META charset
892 // declaration, we will add the META which have correct charset declaration
893 // as first child of HEAD element and remove all original META charset
895 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
896 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc
) {
897 base::FilePath page_file_path
=
898 GetTestFilePath("dom_serializer", "youtube_2.htm");
900 GURL file_url
= net::FilePathToFileURL(page_file_path
);
901 ASSERT_TRUE(file_url
.SchemeIsFile());
902 // Load the test file.
903 NavigateToURL(shell(), file_url
);
905 PostTaskToInProcessRendererAndWait(
907 &DomSerializerTests::
908 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer
,
909 base::Unretained(this), file_url
));
912 // Test situation of html entities in text when serializing HTML DOM.
913 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeHTMLDOMWithEntitiesInText
) {
914 // Need to spin up the renderer and also navigate to a file url so that the
915 // renderer code doesn't attempt a fork when it sees a load to file scheme
916 // from non-file scheme.
917 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
919 PostTaskToInProcessRendererAndWait(
921 &DomSerializerTests::SerializeHTMLDOMWithEntitiesInTextOnRenderer
,
922 base::Unretained(this)));
925 // Test situation of html entities in attribute value when serializing
927 // This test started to fail at WebKit r65388. See http://crbug.com/52279.
928 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
929 SerializeHTMLDOMWithEntitiesInAttributeValue
) {
930 // Need to spin up the renderer and also navigate to a file url so that the
931 // renderer code doesn't attempt a fork when it sees a load to file scheme
932 // from non-file scheme.
933 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
935 PostTaskToInProcessRendererAndWait(
937 &DomSerializerTests::
938 SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer
,
939 base::Unretained(this)));
942 // Test situation of non-standard HTML entities when serializing HTML DOM.
943 // This test started to fail at WebKit r65351. See http://crbug.com/52279.
944 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
945 SerializeHTMLDOMWithNonStandardEntities
) {
946 // Make a test file URL and load it.
947 base::FilePath page_file_path
= GetTestFilePath(
948 "dom_serializer", "nonstandard_htmlentities.htm");
949 GURL file_url
= net::FilePathToFileURL(page_file_path
);
950 NavigateToURL(shell(), file_url
);
952 PostTaskToInProcessRendererAndWait(
954 &DomSerializerTests::
955 SerializeHTMLDOMWithNonStandardEntitiesOnRenderer
,
956 base::Unretained(this), file_url
));
959 // Test situation of BASE tag in original document when serializing HTML DOM.
960 // When serializing, we should comment the BASE tag, append a new BASE tag.
961 // rewrite all the savable URLs to relative local path, and change other URLs
963 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeHTMLDOMWithBaseTag
) {
964 base::FilePath page_file_path
= GetTestFilePath(
965 "dom_serializer", "html_doc_has_base_tag.htm");
967 // Get page dir URL which is base URL of this file.
968 base::FilePath dir_name
= page_file_path
.DirName();
969 dir_name
= dir_name
.Append(
970 base::FilePath::StringType(base::FilePath::kSeparators
[0], 1));
971 GURL path_dir_url
= net::FilePathToFileURL(dir_name
);
974 GURL file_url
= net::FilePathToFileURL(page_file_path
);
975 ASSERT_TRUE(file_url
.SchemeIsFile());
976 // Load the test file.
977 NavigateToURL(shell(), file_url
);
979 PostTaskToInProcessRendererAndWait(
981 &DomSerializerTests::SerializeHTMLDOMWithBaseTagOnRenderer
,
982 base::Unretained(this), file_url
, path_dir_url
));
985 // Serializing page which has an empty HEAD tag.
986 IN_PROC_BROWSER_TEST_F(DomSerializerTests
, SerializeHTMLDOMWithEmptyHead
) {
987 // Need to spin up the renderer and also navigate to a file url so that the
988 // renderer code doesn't attempt a fork when it sees a load to file scheme
989 // from non-file scheme.
990 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
992 PostTaskToInProcessRendererAndWait(
993 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithEmptyHeadOnRenderer
,
994 base::Unretained(this)));
997 // Test that we don't crash when the page contains an iframe that
998 // was handled as a download (http://crbug.com/42212).
999 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
1000 SerializeDocumentWithDownloadedIFrame
) {
1001 base::FilePath page_file_path
= GetTestFilePath(
1002 "dom_serializer", "iframe-src-is-exe.htm");
1003 GURL file_url
= net::FilePathToFileURL(page_file_path
);
1004 ASSERT_TRUE(file_url
.SchemeIsFile());
1005 // Load the test file.
1006 NavigateToURL(shell(), file_url
);
1008 PostTaskToInProcessRendererAndWait(
1010 &DomSerializerTests::
1011 SerializeDocumentWithDownloadedIFrameOnRenderer
,
1012 base::Unretained(this), file_url
));
1015 IN_PROC_BROWSER_TEST_F(DomSerializerTests
,
1016 SubResourceForElementsInNonHTMLNamespace
) {
1017 base::FilePath page_file_path
= GetTestFilePath(
1018 "dom_serializer", "non_html_namespace.htm");
1019 GURL file_url
= net::FilePathToFileURL(page_file_path
);
1020 NavigateToURL(shell(), file_url
);
1022 PostTaskToInProcessRendererAndWait(
1024 &DomSerializerTests::
1025 SubResourceForElementsInNonHTMLNamespaceOnRenderer
,
1026 base::Unretained(this), file_url
));
1029 } // namespace content