2 * Copyright (C) 2009 Google Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 #include "public/web/WebPageSerializer.h"
34 #include "core/HTMLNames.h"
35 #include "core/dom/Document.h"
36 #include "core/dom/Element.h"
37 #include "core/frame/LocalFrame.h"
38 #include "core/html/HTMLAllCollection.h"
39 #include "core/html/HTMLFrameElementBase.h"
40 #include "core/html/HTMLFrameOwnerElement.h"
41 #include "core/html/HTMLInputElement.h"
42 #include "core/html/HTMLTableElement.h"
43 #include "core/loader/DocumentLoader.h"
44 #include "core/page/Page.h"
45 #include "core/page/PageSerializer.h"
46 #include "platform/SerializedResource.h"
47 #include "platform/mhtml/MHTMLArchive.h"
48 #include "platform/weborigin/KURL.h"
49 #include "public/platform/WebCString.h"
50 #include "public/platform/WebString.h"
51 #include "public/platform/WebURL.h"
52 #include "public/platform/WebVector.h"
53 #include "public/web/WebFrame.h"
54 #include "public/web/WebPageSerializerClient.h"
55 #include "public/web/WebView.h"
56 #include "web/WebLocalFrameImpl.h"
57 #include "web/WebPageSerializerImpl.h"
58 #include "web/WebViewImpl.h"
59 #include "wtf/Vector.h"
60 #include "wtf/text/StringConcatenate.h"
// Returns the absolute URL of the sub-resource that |element| refers to.
// The attribute to read is the one named by element->subResourceAttributeName();
// its value is resolved through the owning document's completeURL().
// NOTE(review): the statements guarded by the two `if` tests below are not
// visible in this excerpt -- presumably each returns an empty KURL; confirm
// against the complete file.
66 KURL
getSubResourceURLFromElement(Element
* element
)
69 const QualifiedName
& attributeName
= element
->subResourceAttributeName();
// A null attribute name is special-cased (presumably: the element has no
// sub-resource attribute at all).
70 if (attributeName
== QualifiedName::null())
73 String value
= element
->getAttribute(attributeName
);
74 // Ignore empty values and javascript: content (matched case-insensitively
// after stripping surrounding whitespace).
75 if (value
.isEmpty() || value
.stripWhiteSpace().startsWith("javascript:", TextCaseInsensitive
))
// Resolve the (possibly relative) attribute value against the document.
78 return element
->document().completeURL(value
);
// Records what a single |element| contributes to the resource scan:
//  - for frame/object/embed elements whose content frame is a LocalFrame, the
//    frame is appended to |framesToVisit| unless it is already in
//    |visitedFrames|;
//  - the element's sub-resource URL (see getSubResourceURLFromElement) is
//    appended to |resourceURLs| when it is non-empty, valid, either HTTP-family
//    or a local file, and not already present.
// |frameURLs| is not referenced by any line visible in this excerpt.
// NOTE(review): several original lines (braces, and the statements following
// the frame branch and the protocol check) are missing from this excerpt, so
// the exact early-exit points cannot be confirmed from here.
81 void retrieveResourcesForElement(Element
* element
,
82 Vector
<LocalFrame
*>* visitedFrames
,
83 Vector
<LocalFrame
*>* framesToVisit
,
84 Vector
<KURL
>* frameURLs
,
85 Vector
<KURL
>* resourceURLs
)
88 // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
89 if (isHTMLFrameElementBase(*element
) || isHTMLObjectElement(*element
) || isHTMLEmbedElement(*element
)) {
90 Frame
* frame
= toHTMLFrameOwnerElement(element
)->contentFrame();
// Only local frames are queued; the frame may be null or non-local.
91 if (frame
&& frame
->isLocalFrame()) {
92 if (!visitedFrames
->contains(toLocalFrame(frame
)))
93 framesToVisit
->append(toLocalFrame(frame
));
98 KURL url
= getSubResourceURLFromElement(element
);
99 if (url
.isEmpty() || !url
.isValid())
100 return; // No subresource for this node.
102 // Ignore URLs that have non-standard protocols. Since the FTP protocol
103 // does not have a cache mechanism, we skip it as well.
104 if (!url
.protocolIsInHTTPFamily() && !url
.isLocalFile())
// Keep |resourceURLs| free of duplicates.
107 if (!resourceURLs
->contains(url
))
108 resourceURLs
->append(url
);
// Scans one |frame|: checks its URL (taken from the document loader's request)
// for validity and against |supportedSchemes|, records the frame in
// |visitedFrames| and its URL in |frameURLs| (without duplicates), then calls
// retrieveResourcesForElement() for every element returned by
// document()->all().
// NOTE(review): the statements guarded by the validity, scheme and
// already-visited checks, as well as the end of the scheme loop, are not
// visible in this excerpt -- presumably they return early / break; confirm
// against the complete file.
111 void retrieveResourcesForFrame(LocalFrame
* frame
,
112 const WebVector
<WebCString
>& supportedSchemes
,
113 Vector
<LocalFrame
*>* visitedFrames
,
114 Vector
<LocalFrame
*>* framesToVisit
,
115 Vector
<KURL
>* frameURLs
,
116 Vector
<KURL
>* resourceURLs
)
118 KURL frameURL
= frame
->loader().documentLoader()->request().url();
120 // If the frame's URL is invalid, ignore it, it is not retrievable.
121 if (!frameURL
.isValid())
124 // Ignore frames from unsupported schemes.
125 bool isValidScheme
= false;
// Linear search: the flag is set as soon as one supported scheme matches the
// frame URL's protocol.
126 for (size_t i
= 0; i
< supportedSchemes
.size(); ++i
) {
127 if (frameURL
.protocolIs(static_cast<CString
>(supportedSchemes
[i
]).data())) {
128 isValidScheme
= true;
135 // If we have already seen that frame, ignore it.
136 if (visitedFrames
->contains(frame
))
138 visitedFrames
->append(frame
);
// Keep |frameURLs| free of duplicates.
139 if (!frameURLs
->contains(frameURL
))
140 frameURLs
->append(frameURL
);
142 // Now get the resources associated with each node of the document.
143 RefPtrWillBeRawPtr
<HTMLAllCollection
> allElements
= frame
->document()->all();
144 for (unsigned i
= 0; i
< allElements
->length(); ++i
) {
145 Element
* element
= allElements
->item(i
);
146 retrieveResourcesForElement(element
,
147 visitedFrames
, framesToVisit
,
148 frameURLs
, resourceURLs
);
// PageSerializer delegate used when producing MHTML (it is instantiated in
// serializePageToMHTML). It overrides shouldIgnoreAttribute() to filter
// attributes during serialization.
// NOTE(review): the access specifier and closing brace of the class are not
// visible in this excerpt.
152 class MHTMLPageSerializerDelegate final
: public PageSerializer::Delegate
{
154 ~MHTMLPageSerializerDelegate() override
;
// Returns whether the given attribute should be left out of the output.
155 bool shouldIgnoreAttribute(const Attribute
&) override
;
// Out-of-line destructor definition. NOTE(review): its body is not visible in
// this excerpt.
159 MHTMLPageSerializerDelegate::~MHTMLPageSerializerDelegate()
// Returns true when |attribute| is the srcset attribute (compared by local
// name), false for every other attribute.
163 bool MHTMLPageSerializerDelegate::shouldIgnoreAttribute(const Attribute
& attribute
)
165 // TODO(fgorski): Presence of srcset attribute causes MHTML to not display images, as only the value of src
166 // is pulled into the archive. Discarding srcset prevents the problem. Long term we should make sure that MHTML
167 // plays nicely with srcset.
168 return attribute
.localName() == HTMLNames::srcsetAttr
;
// Serializes the page shown in |view| and stores one Resource per serialized
// resource in |*resourcesParam|.
// The page is serialized with a PageSerializer that has no delegate (a null
// PassOwnPtr is passed). Each SerializedResource is then converted: the URL is
// copied, the MIME type is converted with ascii(), and the data bytes are
// copied into a WebCString.
// NOTE(review): the declaration of the per-iteration |resource| variable and
// the loop's closing brace are not visible in this excerpt.
173 void WebPageSerializer::serialize(WebView
* view
, WebVector
<WebPageSerializer::Resource
>* resourcesParam
)
175 Vector
<SerializedResource
> resources
;
176 PageSerializer
serializer(&resources
, PassOwnPtr
<PageSerializer::Delegate
>(nullptr));
177 serializer
.serialize(toWebViewImpl(view
)->page());
// Convert the internal SerializedResource entries to the public Resource type.
179 Vector
<Resource
> result
;
180 for (Vector
<SerializedResource
>::const_iterator iter
= resources
.begin(); iter
!= resources
.end(); ++iter
) {
182 resource
.url
= iter
->url
;
183 resource
.mimeType
= iter
->mimeType
.ascii();
184 // FIXME: we are copying all the resource data here. Ideally we would have a WebSharedData().
185 resource
.data
= WebCString(iter
->data
->data(), iter
->data
->size());
186 result
.append(resource
);
// Hand the converted list back through the out-parameter.
189 *resourcesParam
= result
;
// Serializes |page| into an MHTML buffer.
// The page is serialized with a PageSerializer that uses an
// MHTMLPageSerializerDelegate; the collected resources are then passed to
// MHTMLArchive::generateMHTMLData() together with |encodingPolicy| and the
// title and suggested MIME type of the main frame's document.
192 static PassRefPtr
<SharedBuffer
> serializePageToMHTML(Page
* page
, MHTMLArchive::EncodingPolicy encodingPolicy
)
194 Vector
<SerializedResource
> resources
;
195 PageSerializer
serializer(&resources
, adoptPtr(new MHTMLPageSerializerDelegate
));
196 serializer
.serialize(page
);
// The archive's title and MIME type come from the main frame's document.
197 Document
* document
= page
->deprecatedLocalMainFrame()->document();
198 return MHTMLArchive::generateMHTMLData(resources
, encodingPolicy
, document
->title(), document
->suggestedMIMEType());
// Serializes the page of |view| to MHTML with MHTMLArchive::UseDefaultEncoding
// and returns the bytes as a WebCString (the buffer contents are copied).
201 WebCString
WebPageSerializer::serializeToMHTML(WebView
* view
)
203 RefPtr
<SharedBuffer
> mhtml
= serializePageToMHTML(toWebViewImpl(view
)->page(), MHTMLArchive::UseDefaultEncoding
);
204 // FIXME: we are copying all the data here. Ideally we would have a WebSharedData().
205 return WebCString(mhtml
->data(), mhtml
->size());
// Serializes the page of |view| to MHTML with MHTMLArchive::UseBinaryEncoding
// and returns the bytes as a WebCString (the buffer contents are copied).
208 WebCString
WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView
* view
)
210 RefPtr
<SharedBuffer
> mhtml
= serializePageToMHTML(toWebViewImpl(view
)->page(), MHTMLArchive::UseBinaryEncoding
);
211 // FIXME: we are copying all the data here. Ideally we would have a WebSharedData().
212 return WebCString(mhtml
->data(), mhtml
->size());
// Serializes |frame| by constructing a WebPageSerializerImpl from the given
// arguments and returning the result of its serialize() call.
// NOTE(review): |recursive| is forwarded to WebPageSerializerImpl below, but
// its parameter declaration is not visible in this excerpt; the meaning of
// |links|, |localPaths| and |localDirectoryName| is defined by
// WebPageSerializerImpl and cannot be confirmed from here.
215 bool WebPageSerializer::serialize(WebLocalFrame
* frame
,
217 WebPageSerializerClient
* client
,
218 const WebVector
<WebURL
>& links
,
219 const WebVector
<WebString
>& localPaths
,
220 const WebString
& localDirectoryName
)
222 WebPageSerializerImpl
serializerImpl(
223 frame
, recursive
, client
, links
, localPaths
, localDirectoryName
);
224 return serializerImpl
.serialize();
// Collects the URLs of all frames and sub-resources reachable from the main
// frame of |view|.
// Starting with the main frame, frames are taken from the front of a worklist
// and scanned with retrieveResourcesForFrame(), which may append further
// frames to the worklist. The collected KURLs are then converted to WebURLs:
// |*resourceURLs| receives every resource URL, and |*frameURLs| receives the
// frame URLs that do not also appear as a resource URL.
// NOTE(review): the function's return statements (and any check of
// |mainFrame|) are not visible in this excerpt.
227 bool WebPageSerializer::retrieveAllResources(WebView
* view
,
228 const WebVector
<WebCString
>& supportedSchemes
,
229 WebVector
<WebURL
>* resourceURLs
,
230 WebVector
<WebURL
>* frameURLs
) {
231 WebLocalFrameImpl
* mainFrame
= toWebLocalFrameImpl(view
->mainFrame());
235 Vector
<LocalFrame
*> framesToVisit
;
236 Vector
<LocalFrame
*> visitedFrames
;
237 Vector
<KURL
> frameKURLs
;
238 Vector
<KURL
> resourceKURLs
;
240 // Let's retrieve the resources from every frame in this page.
241 framesToVisit
.append(mainFrame
->frame());
// Process the worklist front to back until no frames remain.
242 while (!framesToVisit
.isEmpty()) {
243 LocalFrame
* frame
= framesToVisit
[0];
244 framesToVisit
.remove(0);
245 retrieveResourcesForFrame(frame
, supportedSchemes
,
246 &visitedFrames
, &framesToVisit
,
247 &frameKURLs
, &resourceKURLs
);
250 // Converts the results to WebURLs.
251 WebVector
<WebURL
> resultResourceURLs(resourceKURLs
.size());
252 for (size_t i
= 0; i
< resourceKURLs
.size(); ++i
) {
253 resultResourceURLs
[i
] = resourceKURLs
[i
];
254 // A frame's src can point to the same URL as another resource, keep the
255 // resource URL only in such cases.
256 size_t index
= frameKURLs
.find(resourceKURLs
[i
]);
257 if (index
!= kNotFound
)
258 frameKURLs
.remove(index
);
260 *resourceURLs
= resultResourceURLs
;
// Whatever is left in frameKURLs was not also listed as a resource URL.
261 WebVector
<WebURL
> resultFrameURLs(frameKURLs
.size());
262 for (size_t i
= 0; i
< frameKURLs
.size(); ++i
)
263 resultFrameURLs
[i
] = frameKURLs
[i
];
264 *frameURLs
= resultFrameURLs
;
// Builds an HTML <meta http-equiv="Content-Type"> tag declaring |charset|.
// |charset| is concatenated into the markup as-is (no escaping is performed
// here).
269 WebString
WebPageSerializer::generateMetaCharsetDeclaration(const WebString
& charset
)
271 String charsetString
= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String
&>(charset
) + "\">";
272 return charsetString
;
// Builds the "saved from url" HTML comment for |url|. The number in
// parentheses is the length of url.spec(), zero-padded to at least four
// digits (%04d).
// NOTE(review): the argument supplied for the %s conversion is not visible in
// this excerpt -- presumably the URL spec itself; confirm against the complete
// file.
275 WebString
WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL
& url
)
277 return String::format("\n<!-- saved from url=(%04d)%s -->\n",
278 static_cast<int>(url
.spec().length()),
// Builds a <base href="."> tag. When |baseTarget| is empty the tag has no
// target attribute; otherwise |baseTarget| is concatenated in as the target
// attribute value (no escaping is performed here).
// NOTE(review): the end of this function is not visible in this excerpt --
// presumably it returns |baseString|; confirm against the complete file.
282 WebString
WebPageSerializer::generateBaseTagDeclaration(const WebString
& baseTarget
)
284 if (baseTarget
.isEmpty())
285 return String("<base href=\".\">");
286 String baseString
= "<base href=\".\" target=\"" + static_cast<const String
&>(baseTarget
) + "\">";