2 * Copyright (C) 2009 Google Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// How we handle the base tag better.
//
// Currently, the normal way we handle the base tag is:
// a) Links that have corresponding locally saved files, such as savable CSS
// and JavaScript files, are rewritten as relative URLs that point to the
// locally saved file. They are not resolved to absolute file URLs because,
// after moving the saved files from one directory to another, absolute file
// URLs would become dead links.
// b) Links that have no corresponding locally saved files, such as links in
// A and AREA tags, are resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
// Firefox also handles the base tag this way.
// This approach cannot handle the following situation:
// the base tag is written by JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href="http://www.yahoo.com/"...');" to set up the
// base URL of the page while it loads. Suppose we save "www.yahoo.com" as
// complete HTML to "c:\yahoo.htm". When the saved page is loaded later, the
// JavaScript inserts a base tag <base href="http://www.yahoo.com/"...> into
// the DOM, so all URLs that point to locally saved resource files are resolved
// as "http://www.yahoo.com/yahoo_files/...", and none of the saved resource
// files can be loaded correctly. The page also renders badly because the saved
// sub-resource files (such as CSS and JavaScript files) and sub-frame files
// cannot be fetched.
// Firefox, IE and WebKit-based browsers all currently have this problem.
// The solution is to comment out the old base tag and write a new base tag,
// <base href="." ...>, right after the commented-out one. WebKit always uses
// the latest "href" attribute of a base tag to set the document's base URL.
// Based on this behavior, whenever we encounter a base tag we comment it out
// and write a new base tag <base href="."> after it. The newly added base tag
// helps the engine locate the correct base URL for loading the locally saved
// resource files. We also need to inherit the base target value from the
// document object when appending the new base tag.
// If there are multiple base tags in the original document, we comment out all
// of the old base tags and append a new base tag after each one, because we do
// not know whether those base tags were original content or were added by
// JavaScript. If they were added by JavaScript, the script(s) will still
// insert base tag(s) into the DOM when the saved page is loaded, so the newly
// added base tag(s) can override the incorrect base URL and make sure we
// always load the correct locally saved resource files.
79 #include "web/WebPageSerializerImpl.h"
81 #include "core/HTMLNames.h"
82 #include "core/dom/Document.h"
83 #include "core/dom/DocumentType.h"
84 #include "core/dom/Element.h"
85 #include "core/editing/serializers/Serialization.h"
86 #include "core/html/HTMLAllCollection.h"
87 #include "core/html/HTMLElement.h"
88 #include "core/html/HTMLFormElement.h"
89 #include "core/html/HTMLHtmlElement.h"
90 #include "core/html/HTMLMetaElement.h"
91 #include "core/loader/DocumentLoader.h"
92 #include "core/loader/FrameLoader.h"
93 #include "public/platform/WebVector.h"
94 #include "web/WebLocalFrameImpl.h"
95 #include "wtf/text/TextEncoding.h"
// Soft upper bound on the number of characters kept in the buffer that
// temporarily holds generated HTML before it is flushed. The limit can be
// exceeded when a single very large contiguous string is appended.
static const unsigned dataBufferCapacity = 64 * 1024;
104 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL
& url
,
105 const WTF::TextEncoding
& textEncoding
,
107 const String
& directoryName
)
109 , textEncoding(textEncoding
)
111 , directoryName(directoryName
)
112 , isHTMLDocument(document
->isHTMLDocument())
113 , haveSeenDocType(false)
114 , haveAddedCharsetDeclaration(false)
115 , skipMetaElement(nullptr)
116 , isInScriptOrStyleTag(false)
117 , haveAddedXMLProcessingDirective(false)
118 , haveAddedContentsBeforeEnd(false)
122 String
WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
123 const Element
* element
, SerializeDomParam
* param
, bool* needSkip
)
125 StringBuilder result
;
128 if (param
->isHTMLDocument
) {
129 // Skip the open tag of original META tag which declare charset since we
130 // have overrided the META which have correct charset declaration after
131 // serializing open tag of HEAD element.
133 if (isHTMLMetaElement(*element
)) {
134 const HTMLMetaElement
& meta
= toHTMLMetaElement(*element
);
135 // Check whether the META tag has declared charset or not.
136 String equiv
= meta
.httpEquiv();
137 if (equalIgnoringCase(equiv
, "content-type")) {
138 String content
= meta
.content();
139 if (content
.length() && content
.contains("charset", TextCaseInsensitive
)) {
140 // Find META tag declared charset, we need to skip it when
142 param
->skipMetaElement
= element
;
146 } else if (isHTMLHtmlElement(*element
)) {
147 // Check something before processing the open tag of HEAD element.
148 // First we add doc type declaration if original document has it.
149 if (!param
->haveSeenDocType
) {
150 param
->haveSeenDocType
= true;
151 result
.append(createMarkup(param
->document
->doctype()));
154 // Add MOTW declaration before html tag.
155 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
156 result
.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param
->url
));
157 } else if (isHTMLBaseElement(*element
)) {
158 // Comment the BASE tag when serializing dom.
159 result
.appendLiteral("<!--");
162 // Write XML declaration.
163 if (!param
->haveAddedXMLProcessingDirective
) {
164 param
->haveAddedXMLProcessingDirective
= true;
165 // Get encoding info.
166 String xmlEncoding
= param
->document
->xmlEncoding();
167 if (xmlEncoding
.isEmpty())
168 xmlEncoding
= param
->document
->encodingName();
169 if (xmlEncoding
.isEmpty())
170 xmlEncoding
= UTF8Encoding().name();
171 result
.appendLiteral("<?xml version=\"");
172 result
.append(param
->document
->xmlVersion());
173 result
.appendLiteral("\" encoding=\"");
174 result
.append(xmlEncoding
);
175 if (param
->document
->xmlStandalone())
176 result
.appendLiteral("\" standalone=\"yes");
177 result
.appendLiteral("\"?>\n");
179 // Add doc type declaration if original document has it.
180 if (!param
->haveSeenDocType
) {
181 param
->haveSeenDocType
= true;
182 result
.append(createMarkup(param
->document
->doctype()));
185 return result
.toString();
188 String
WebPageSerializerImpl::postActionAfterSerializeOpenTag(
189 const Element
* element
, SerializeDomParam
* param
)
191 StringBuilder result
;
193 param
->haveAddedContentsBeforeEnd
= false;
194 if (!param
->isHTMLDocument
)
195 return result
.toString();
196 // Check after processing the open tag of HEAD element
197 if (!param
->haveAddedCharsetDeclaration
198 && isHTMLHeadElement(*element
)) {
199 param
->haveAddedCharsetDeclaration
= true;
200 // Check meta element. WebKit only pre-parse the first 512 bytes
201 // of the document. If the whole <HEAD> is larger and meta is the
202 // end of head part, then this kind of pages aren't decoded correctly
203 // because of this issue. So when we serialize the DOM, we need to
204 // make sure the meta will in first child of head tag.
205 // See http://bugs.webkit.org/show_bug.cgi?id=16621.
206 // First we generate new content for writing correct META element.
207 result
.append(WebPageSerializer::generateMetaCharsetDeclaration(
208 String(param
->textEncoding
.name())));
210 param
->haveAddedContentsBeforeEnd
= true;
211 // Will search each META which has charset declaration, and skip them all
212 // in PreActionBeforeSerializeOpenTag.
213 } else if (isHTMLScriptElement(*element
) || isHTMLScriptElement(*element
)) {
214 param
->isInScriptOrStyleTag
= true;
217 return result
.toString();
220 String
WebPageSerializerImpl::preActionBeforeSerializeEndTag(
221 const Element
* element
, SerializeDomParam
* param
, bool* needSkip
)
226 if (!param
->isHTMLDocument
)
228 // Skip the end tag of original META tag which declare charset.
229 // Need not to check whether it's META tag since we guarantee
230 // skipMetaElement is definitely META tag if it's not 0.
231 if (param
->skipMetaElement
== element
) {
233 } else if (isHTMLScriptElement(*element
) || isHTMLScriptElement(*element
)) {
234 ASSERT(param
->isInScriptOrStyleTag
);
235 param
->isInScriptOrStyleTag
= false;
241 // After we finish serializing end tag of a element, we give the target
242 // element a chance to do some post work to add some additional data.
243 String
WebPageSerializerImpl::postActionAfterSerializeEndTag(
244 const Element
* element
, SerializeDomParam
* param
)
246 StringBuilder result
;
248 if (!param
->isHTMLDocument
)
249 return result
.toString();
250 // Comment the BASE tag when serializing DOM.
251 if (isHTMLBaseElement(*element
)) {
252 result
.appendLiteral("-->");
253 // Append a new base tag declaration.
254 result
.append(WebPageSerializer::generateBaseTagDeclaration(
255 param
->document
->baseTarget()));
258 return result
.toString();
261 void WebPageSerializerImpl::saveHTMLContentToBuffer(
262 const String
& result
, SerializeDomParam
* param
)
264 m_dataBuffer
.append(result
);
265 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished
,
270 void WebPageSerializerImpl::encodeAndFlushBuffer(
271 WebPageSerializerClient::PageSerializationStatus status
,
272 SerializeDomParam
* param
,
273 FlushOption flushOption
)
275 // Data buffer is not full nor do we want to force flush.
276 if (flushOption
!= ForceFlush
&& m_dataBuffer
.length() <= dataBufferCapacity
)
279 String content
= m_dataBuffer
.toString();
280 m_dataBuffer
.clear();
282 CString encodedContent
= param
->textEncoding
.normalizeAndEncode(content
, WTF::EntitiesForUnencodables
);
284 // Send result to the client.
285 m_client
->didSerializeDataForFrame(param
->url
,
286 WebCString(encodedContent
.data(), encodedContent
.length()),
290 void WebPageSerializerImpl::openTagToString(Element
* element
,
291 SerializeDomParam
* param
)
294 StringBuilder result
;
295 // Do pre action for open tag.
296 result
.append(preActionBeforeSerializeOpenTag(element
, param
, &needSkip
));
301 result
.append(element
->nodeName().lower());
302 // Go through all attributes and serialize them.
303 AttributeCollection attributes
= element
->attributes();
304 AttributeCollection::iterator end
= attributes
.end();
305 for (AttributeCollection::iterator it
= attributes
.begin(); it
!= end
; ++it
) {
307 // Add attribute pair
308 result
.append(it
->name().toString());
309 result
.appendLiteral("=\"");
310 if (!it
->value().isEmpty()) {
311 const String
& attrValue
= it
->value();
313 // Check whether we need to replace some resource links
314 // with local resource paths.
315 const QualifiedName
& attrName
= it
->name();
316 if (element
->hasLegalLinkAttribute(attrName
)) {
317 // For links start with "javascript:", we do not change it.
318 if (attrValue
.startsWith("javascript:", TextCaseInsensitive
)) {
319 result
.append(attrValue
);
321 // Get the absolute link
322 WebLocalFrameImpl
* subFrame
= WebLocalFrameImpl::fromFrameOwnerElement(element
);
323 String completeURL
= subFrame
? subFrame
->frame()->document()->url() :
324 param
->document
->completeURL(attrValue
);
325 // Check whether we have local files for those link.
326 if (m_localLinks
.contains(completeURL
)) {
327 if (!param
->directoryName
.isEmpty()) {
328 result
.appendLiteral("./");
329 result
.append(param
->directoryName
);
332 result
.append(m_localLinks
.get(completeURL
));
334 result
.append(completeURL
);
338 if (param
->isHTMLDocument
)
339 result
.append(m_htmlEntities
.convertEntitiesInString(attrValue
));
341 result
.append(m_xmlEntities
.convertEntitiesInString(attrValue
));
347 // Do post action for open tag.
348 String addedContents
= postActionAfterSerializeOpenTag(element
, param
);
349 // Complete the open tag for element when it has child/children.
350 if (element
->hasChildren() || param
->haveAddedContentsBeforeEnd
)
352 // Append the added contents generate in post action of open tag.
353 result
.append(addedContents
);
354 // Save the result to data buffer.
355 saveHTMLContentToBuffer(result
.toString(), param
);
358 // Serialize end tag of an specified element.
359 void WebPageSerializerImpl::endTagToString(Element
* element
,
360 SerializeDomParam
* param
)
363 StringBuilder result
;
364 // Do pre action for end tag.
365 result
.append(preActionBeforeSerializeEndTag(element
, param
, &needSkip
));
368 // Write end tag when element has child/children.
369 if (element
->hasChildren() || param
->haveAddedContentsBeforeEnd
) {
370 result
.appendLiteral("</");
371 result
.append(element
->nodeName().lower());
374 // Check whether we have to write end tag for empty element.
375 if (param
->isHTMLDocument
) {
377 // FIXME: This code is horribly wrong. WebPageSerializerImpl must die.
378 if (!element
->isHTMLElement() || !toHTMLElement(element
)->ieForbidsInsertHTML()) {
379 // We need to write end tag when it is required.
380 result
.appendLiteral("</");
381 result
.append(element
->nodeName().lower());
385 // For xml base document.
386 result
.appendLiteral(" />");
389 // Do post action for end tag.
390 result
.append(postActionAfterSerializeEndTag(element
, param
));
391 // Save the result to data buffer.
392 saveHTMLContentToBuffer(result
.toString(), param
);
395 void WebPageSerializerImpl::buildContentForNode(Node
* node
,
396 SerializeDomParam
* param
)
398 switch (node
->nodeType()) {
399 case Node::ELEMENT_NODE
:
400 // Process open tag of element.
401 openTagToString(toElement(node
), param
);
402 // Walk through the children nodes and process it.
403 for (Node
*child
= node
->firstChild(); child
; child
= child
->nextSibling())
404 buildContentForNode(child
, param
);
405 // Process end tag of element.
406 endTagToString(toElement(node
), param
);
408 case Node::TEXT_NODE
:
409 saveHTMLContentToBuffer(createMarkup(node
), param
);
411 case Node::ATTRIBUTE_NODE
:
412 case Node::DOCUMENT_NODE
:
413 case Node::DOCUMENT_FRAGMENT_NODE
:
415 ASSERT_NOT_REACHED();
417 // Document type node can be in DOM?
418 case Node::DOCUMENT_TYPE_NODE
:
419 param
->haveSeenDocType
= true;
421 // For other type node, call default action.
422 saveHTMLContentToBuffer(createMarkup(node
), param
);
427 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame
* frame
,
428 bool recursiveSerialization
,
429 WebPageSerializerClient
* client
,
430 const WebVector
<WebURL
>& links
,
431 const WebVector
<WebString
>& localPaths
,
432 const WebString
& localDirectoryName
)
434 , m_recursiveSerialization(recursiveSerialization
)
435 , m_framesCollected(false)
436 , m_localDirectoryName(localDirectoryName
)
437 , m_htmlEntities(false)
438 , m_xmlEntities(true)
440 // Must specify available webframe.
442 m_specifiedWebLocalFrameImpl
= toWebLocalFrameImpl(frame
);
443 // Make sure we have non 0 client.
445 // Build local resources map.
446 ASSERT(links
.size() == localPaths
.size());
447 for (size_t i
= 0; i
< links
.size(); i
++) {
449 ASSERT(!m_localLinks
.contains(url
.string()));
450 m_localLinks
.set(url
.string(), localPaths
[i
]);
453 ASSERT(m_dataBuffer
.isEmpty());
456 void WebPageSerializerImpl::collectTargetFrames()
458 ASSERT(!m_framesCollected
);
459 m_framesCollected
= true;
461 // First, process main frame.
462 m_frames
.append(m_specifiedWebLocalFrameImpl
);
463 // Return now if user only needs to serialize specified frame, not including
465 if (!m_recursiveSerialization
)
467 // Collect all frames inside the specified frame.
468 for (WebLocalFrameImpl
* frame
: m_frames
) {
469 // Get current using document.
470 Document
* currentDoc
= frame
->frame()->document();
471 // Go through sub-frames.
472 RefPtrWillBeRawPtr
<HTMLAllCollection
> all
= currentDoc
->all();
474 for (unsigned i
= 0; Element
* element
= all
->item(i
); ++i
) {
475 if (!element
->isHTMLElement())
477 WebLocalFrameImpl
* webFrame
=
478 WebLocalFrameImpl::fromFrameOwnerElement(element
);
480 m_frames
.append(webFrame
);
485 bool WebPageSerializerImpl::serialize()
487 if (!m_framesCollected
)
488 collectTargetFrames();
490 bool didSerialization
= false;
491 KURL mainURL
= m_specifiedWebLocalFrameImpl
->frame()->document()->url();
493 for (unsigned i
= 0; i
< m_frames
.size(); ++i
) {
494 WebLocalFrameImpl
* webFrame
= m_frames
[i
];
495 Document
* document
= webFrame
->frame()->document();
496 const KURL
& url
= document
->url();
498 if (!url
.isValid() || !m_localLinks
.contains(url
.string()))
501 didSerialization
= true;
503 const WTF::TextEncoding
& textEncoding
= document
->encoding().isValid() ? document
->encoding() : UTF8Encoding();
504 String directoryName
= url
== mainURL
? m_localDirectoryName
: "";
506 SerializeDomParam
param(url
, textEncoding
, document
, directoryName
);
508 Element
* documentElement
= document
->documentElement();
510 buildContentForNode(documentElement
, ¶m
);
512 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished
, ¶m
, ForceFlush
);
515 ASSERT(m_dataBuffer
.isEmpty());
516 m_client
->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished
);
517 return didSerialization
;