1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/renderer/savable_resources.h"
9 #include "base/compiler_specific.h"
10 #include "base/logging.h"
11 #include "base/strings/string_util.h"
12 #include "third_party/WebKit/public/platform/WebString.h"
13 #include "third_party/WebKit/public/platform/WebVector.h"
14 #include "third_party/WebKit/public/web/WebDocument.h"
15 #include "third_party/WebKit/public/web/WebElement.h"
16 #include "third_party/WebKit/public/web/WebElementCollection.h"
17 #include "third_party/WebKit/public/web/WebInputElement.h"
18 #include "third_party/WebKit/public/web/WebLocalFrame.h"
19 #include "third_party/WebKit/public/web/WebNode.h"
20 #include "third_party/WebKit/public/web/WebNodeList.h"
21 #include "third_party/WebKit/public/web/WebView.h"
23 using blink::WebDocument
;
24 using blink::WebElement
;
25 using blink::WebElementCollection
;
26 using blink::WebFrame
;
27 using blink::WebInputElement
;
28 using blink::WebLocalFrame
;
30 using blink::WebNodeList
;
31 using blink::WebString
;
32 using blink::WebVector
;
38 // Structure for storage the unique set of all savable resource links for
39 // making sure that no duplicated resource link in final result. The consumer
40 // of the SavableResourcesUniqueCheck is responsible for keeping these pointers
41 // valid for the lifetime of the SavableResourcesUniqueCheck instance.
42 struct SavableResourcesUniqueCheck
{
43 // Unique set of all sub resource links.
44 std::set
<GURL
>* resources_set
;
45 // Unique set of all frame links.
46 std::set
<GURL
>* frames_set
;
47 // Collection of all frames we go through when getting all savable resource
49 std::vector
<WebFrame
*>* frames
;
51 SavableResourcesUniqueCheck()
52 : resources_set(NULL
),
56 SavableResourcesUniqueCheck(std::set
<GURL
>* resources_set
,
57 std::set
<GURL
>* frames_set
, std::vector
<WebFrame
*>* frames
)
58 : resources_set(resources_set
),
59 frames_set(frames_set
),
63 // Get all savable resource links from current element. One element might
64 // have more than one resource link. It is possible to have some links
65 // in one CSS stylesheet.
66 void GetSavableResourceLinkForElement(
67 const WebElement
& element
,
68 const WebDocument
& current_doc
,
69 SavableResourcesUniqueCheck
* unique_check
,
70 SavableResourcesResult
* result
) {
72 // Handle frame and iframe tag.
73 if (element
.hasHTMLTagName("iframe") ||
74 element
.hasHTMLTagName("frame")) {
75 WebFrame
* sub_frame
= WebLocalFrame::fromFrameOwnerElement(element
);
77 unique_check
->frames
->push_back(sub_frame
);
81 // Check whether the node has sub resource URL or not.
82 WebString value
= GetSubResourceLinkFromElement(element
);
86 GURL u
= current_doc
.completeURL(value
);
90 // Ignore those URLs which are not standard protocols. Because FTP
91 // protocol does no have cache mechanism, we will skip all
92 // sub-resources if they use FTP protocol.
93 if (!u
.SchemeIsHTTPOrHTTPS() && !u
.SchemeIs(url::kFileScheme
))
95 // Ignore duplicated resource link.
96 if (!unique_check
->resources_set
->insert(u
).second
)
98 result
->resources_list
->push_back(u
);
99 // Insert referrer for above new resource link.
100 result
->referrer_urls_list
->push_back(GURL());
101 result
->referrer_policies_list
->push_back(blink::WebReferrerPolicyDefault
);
104 // Get all savable resource links from current WebFrameImpl object pointer.
105 void GetAllSavableResourceLinksForFrame(WebFrame
* current_frame
,
106 SavableResourcesUniqueCheck
* unique_check
,
107 SavableResourcesResult
* result
,
108 const char** savable_schemes
) {
109 // Get current frame's URL.
110 GURL current_frame_url
= current_frame
->document().url();
112 // If url of current frame is invalid, ignore it.
113 if (!current_frame_url
.is_valid())
116 // If url of current frame is not a savable protocol, ignore it.
117 bool is_valid_protocol
= false;
118 for (int i
= 0; savable_schemes
[i
] != NULL
; ++i
) {
119 if (current_frame_url
.SchemeIs(savable_schemes
[i
])) {
120 is_valid_protocol
= true;
124 if (!is_valid_protocol
)
127 // If find same frame we have recorded, ignore it.
128 if (!unique_check
->frames_set
->insert(current_frame_url
).second
)
131 // Get current using document.
132 WebDocument current_doc
= current_frame
->document();
133 // Go through all descent nodes.
134 WebElementCollection all
= current_doc
.all();
135 // Go through all elements in this frame.
136 for (WebElement element
= all
.firstItem(); !element
.isNull();
137 element
= all
.nextItem()) {
138 GetSavableResourceLinkForElement(element
,
147 WebString
GetSubResourceLinkFromElement(const WebElement
& element
) {
148 const char* attribute_name
= NULL
;
149 if (element
.hasHTMLTagName("img") ||
150 element
.hasHTMLTagName("script")) {
151 attribute_name
= "src";
152 } else if (element
.hasHTMLTagName("input")) {
153 const WebInputElement input
= element
.toConst
<WebInputElement
>();
154 if (input
.isImageButton()) {
155 attribute_name
= "src";
157 } else if (element
.hasHTMLTagName("body") ||
158 element
.hasHTMLTagName("table") ||
159 element
.hasHTMLTagName("tr") ||
160 element
.hasHTMLTagName("td")) {
161 attribute_name
= "background";
162 } else if (element
.hasHTMLTagName("blockquote") ||
163 element
.hasHTMLTagName("q") ||
164 element
.hasHTMLTagName("del") ||
165 element
.hasHTMLTagName("ins")) {
166 attribute_name
= "cite";
167 } else if (element
.hasHTMLTagName("link")) {
168 // If the link element is not linked to css, ignore it.
169 if (base::LowerCaseEqualsASCII(
170 base::StringPiece16(element
.getAttribute("type")), "text/css") ||
171 base::LowerCaseEqualsASCII(
172 base::StringPiece16(element
.getAttribute("rel")), "stylesheet")) {
173 // TODO(jnd): Add support for extracting links of sub-resources which
174 // are inside style-sheet such as @import, url(), etc.
175 // See bug: http://b/issue?id=1111667.
176 attribute_name
= "href";
181 WebString value
= element
.getAttribute(WebString::fromUTF8(attribute_name
));
182 // If value has content and not start with "javascript:" then return it,
183 // otherwise return NULL.
184 if (!value
.isNull() && !value
.isEmpty() &&
185 !base::StartsWith(value
.utf8(), "javascript:",
186 base::CompareCase::INSENSITIVE_ASCII
))
192 // Get all savable resource links from current webview, include main
193 // frame and sub-frame
194 bool GetAllSavableResourceLinksForCurrentPage(WebView
* view
,
195 const GURL
& page_url
, SavableResourcesResult
* result
,
196 const char** savable_schemes
) {
197 WebFrame
* main_frame
= view
->mainFrame();
201 std::set
<GURL
> resources_set
;
202 std::set
<GURL
> frames_set
;
203 std::vector
<WebFrame
*> frames
;
204 SavableResourcesUniqueCheck
unique_check(&resources_set
,
208 GURL
main_page_gurl(main_frame
->document().url());
210 // Make sure we are saving same page between embedder and webkit.
211 // If page has being navigated, embedder will get three empty vector,
212 // which will make the saving page job ended.
213 if (page_url
!= main_page_gurl
)
216 // First, process main frame.
217 frames
.push_back(main_frame
);
219 // Check all resource in this page, include sub-frame.
220 for (int i
= 0; i
< static_cast<int>(frames
.size()); ++i
) {
221 // Get current frame's all savable resource links.
222 GetAllSavableResourceLinksForFrame(frames
[i
], &unique_check
, result
,
226 // Since frame's src can also point to sub-resources link, so it is possible
227 // that some URLs in frames_list are also in resources_list. For those
228 // URLs, we will remove it from frame_list, only keep them in resources_list.
229 for (std::set
<GURL
>::iterator it
= frames_set
.begin();
230 it
!= frames_set
.end(); ++it
) {
231 // Append unique frame source to savable frame list.
232 if (resources_set
.find(*it
) == resources_set
.end())
233 result
->frames_list
->push_back(*it
);
239 } // namespace content