1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/utility/importer/bookmark_html_reader.h"
7 #include "base/callback.h"
8 #include "base/file_util.h"
9 #include "base/i18n/icu_string_conversions.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/time/time.h"
14 #include "chrome/common/importer/imported_bookmark_entry.h"
15 #include "chrome/common/importer/imported_favicon_usage.h"
16 #include "chrome/utility/importer/favicon_reencode.h"
17 #include "content/public/common/url_constants.h"
18 #include "net/base/data_url.h"
19 #include "net/base/escape.h"
// Fetches the given |attribute| value from the |attribute_list|. Returns true
// if successful, and |value| will contain the value.
//
// The attribute is located by searching for the literal text
// |attribute| + "=\"" and the value runs up to the next double quote that is
// not preceded by a backslash. The value is returned verbatim (escape
// sequences are not decoded). |value| is left untouched on failure.
//
// An empty value (ATTR="") yields an empty string and returns true; an
// opening quote with no matching closing quote returns false.
bool GetAttribute(const std::string& attribute_list,
                  const std::string& attribute,
                  std::string* value) {
  const char kQuote[] = "\"";

  size_t begin = attribute_list.find(attribute + "=" + kQuote);
  if (begin == std::string::npos)
    return false;  // Can't find the attribute.

  // Skip past |attribute|, the '=' and the opening quote.
  begin += attribute.size() + 2;

  // Start scanning at |begin| itself (not |begin| + 1): if the value is empty
  // the closing quote sits exactly at |begin|, and skipping it would make the
  // scan run on into the next attribute. At |end| == |begin| the character at
  // |end| - 1 is the opening quote, so the backslash test is still safe.
  size_t end = begin;
  while (end < attribute_list.size()) {
    if (attribute_list[end] == '"' && attribute_list[end - 1] != '\\')
      break;
    ++end;
  }

  if (end >= attribute_list.size())
    return false;  // The value is not quoted.

  *value = attribute_list.substr(begin, end - begin);
  return true;
}
// NOTE(review): this listing carries embedded line numbers and some original
// lines are not visible (e.g. the first parameter and the early-return
// statements); the added comments describe only what the visible code shows.
53 // Given the URL of a page and a favicon data URL, adds an appropriate record
54 // to the given favicon usage vector.
55 void DataURLToFaviconUsage(
57 const GURL
& favicon_data
,
58 std::vector
<ImportedFaviconUsage
>* favicons
) {
// Guard: both URLs must be valid and |favicon_data| must use the data: scheme.
59 if (!link_url
.is_valid() || !favicon_data
.is_valid() ||
60 !favicon_data
.SchemeIs(chrome::kDataScheme
))
63 // Parse the data URL.
// |mime_type| and |char_set| are outputs of the parse that the visible code
// never reads again; only |data| is used below.
64 std::string mime_type
, char_set
, data
;
65 if (!net::DataURL::Parse(favicon_data
, &mime_type
, &char_set
, &data
) ||
69 ImportedFaviconUsage usage
;
// Hand the raw payload bytes to ReencodeFavicon, which writes its result into
// |usage.png_data|. &data[0] is used to obtain a pointer to the first byte.
70 if (!importer::ReencodeFavicon(
71 reinterpret_cast<const unsigned char*>(&data
[0]),
72 data
.size(), &usage
.png_data
))
73 return; // Unable to decode.
75 // We need to make up a URL for the favicon. We use a version of the page's
76 // URL so that we can be sure it will not collide.
77 usage
.favicon_url
= GURL(std::string("made-up-favicon:") + link_url
.spec());
79 // We only have one URL per favicon for Firefox 2 bookmarks.
80 usage
.urls
.insert(link_url
);
// Append the completed record to the caller's vector.
82 favicons
->push_back(usage
);
87 namespace bookmark_html_reader
{
// Reads the bookmarks HTML file at |file_path|, splits its content into lines
// on '\n' and examines each line in turn. Recognized lines either update the
// parser state (charset, pending folder name, the |path| of enclosing
// folders) or append an ImportedBookmarkEntry to |bookmarks|. Favicon URLs
// are passed to DataURLToFaviconUsage() together with |favicons|.
// |cancellation_callback| and |valid_url_callback| may be null: each is
// tested with is_null() before it is run.
// NOTE(review): this listing carries embedded line numbers and several
// original lines are not visible (loop header, some declarations and
// control-flow statements); the added comments describe only what the visible
// code shows.
89 void ImportBookmarksFile(
90 const base::Callback
<bool(void)>& cancellation_callback
,
91 const base::Callback
<bool(const GURL
&)>& valid_url_callback
,
92 const base::FilePath
& file_path
,
93 std::vector
<ImportedBookmarkEntry
>* bookmarks
,
94 std::vector
<ImportedFaviconUsage
>* favicons
) {
// NOTE(review): the result of ReadFileToString is not checked in the visible
// code.
96 base::ReadFileToString(file_path
, &content
);
97 std::vector
<std::string
> lines
;
98 base::SplitString(content
, '\n', &lines
);
// Parser state carried from one line to the next:
//  - |last_folder|: a folder name that is pushed onto |path| further down
//    (presumably filled by ParseFolderNameFromLine; that argument line is
//    not visible here).
//  - |last_folder_on_toolbar| / |last_folder_add_date|: outputs of
//    ParseFolderNameFromLine for that folder.
//  - |last_folder_is_empty|: set to true on <DL>, false once a bookmark is
//    seen; decides whether </DL> emits an explicit folder entry.
//  - |has_subfolder|: becomes true at the first <DL>.
//  - |path|: names of the folders enclosing the current line.
//  - |toolbar_folder_index|: |path| size recorded when the toolbar folder
//    was opened; 0 means "not inside the toolbar folder".
100 base::string16 last_folder
;
101 bool last_folder_on_toolbar
= false;
102 bool last_folder_is_empty
= true;
103 bool has_subfolder
= false;
104 base::Time last_folder_add_date
;
105 std::vector
<base::string16
> path
;
106 size_t toolbar_folder_index
= 0;
// Continuation test: true when no cancellation callback is set, or when the
// callback returns false.
110 (cancellation_callback
.is_null() || !cancellation_callback
.Run());
// Trim ' ' characters from the current input line into |line|.
113 base::TrimString(lines
[i
], " ", &line
);
115 // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
116 // separator in Firefox that Chrome does not support. Note that there can be
117 // multiple "<HR>" tags at the beginning of a single line.
118 // See http://crbug.com/257474.
119 static const char kHrTag
[] = "<HR>";
120 while (StartsWithASCII(line
, kHrTag
, false)) {
// arraysize(kHrTag) - 1 is presumably the tag length without the trailing
// NUL, so exactly the tag text is erased before re-trimming.
121 line
.erase(0, arraysize(kHrTag
) - 1);
122 base::TrimString(line
, " ", &line
);
125 // Get the encoding of the bookmark file.
126 if (internal::ParseCharsetFromLine(line
, &charset
))
129 // Get the folder name.
130 if (internal::ParseFolderNameFromLine(line
,
133 &last_folder_on_toolbar
,
134 &last_folder_add_date
)) {
138 // Get the bookmark entry.
139 base::string16 title
;
140 base::string16 shortcut
;
143 base::string16 post_data
;
145 // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
// Try the full bookmark parser first; if it rejects the line, fall back to
// the minimal parser (the || short-circuits when the first one succeeds).
148 internal::ParseBookmarkFromLine(line
, charset
, &title
,
149 &url
, &favicon
, &shortcut
,
150 &add_date
, &post_data
) ||
151 internal::ParseMinimumBookmarkFromLine(line
, charset
, &title
, &url
);
154 last_folder_is_empty
= false;
// The URL must pass |valid_url_callback| when one is supplied.
158 (valid_url_callback
.is_null() || valid_url_callback
.Run(url
))) {
// A toolbar index deeper than the current non-empty |path| is treated as a
// parser error.
159 if (toolbar_folder_index
> path
.size() && !path
.empty()) {
160 NOTREACHED(); // error in parsing.
164 ImportedBookmarkEntry entry
;
165 entry
.creation_time
= add_date
;
169 if (toolbar_folder_index
) {
170 // The toolbar folder should be at the top level.
171 entry
.in_toolbar
= true;
// The entry's path starts at the toolbar folder itself, which sits at index
// |toolbar_folder_index| - 1 in |path|.
172 entry
.path
.assign(path
.begin() + toolbar_folder_index
- 1, path
.end());
174 // Add this bookmark to the list of |bookmarks|.
// If no <DL> has been seen yet and a folder name is pending, that folder is
// pushed onto |path| so it becomes the parent of this bookmark.
175 if (!has_subfolder
&& !last_folder
.empty()) {
176 path
.push_back(last_folder
);
179 entry
.path
.assign(path
.begin(), path
.end());
181 bookmarks
->push_back(entry
);
183 // Save the favicon. DataURLToFaviconUsage will handle the case where
184 // there is no favicon.
186 DataURLToFaviconUsage(url
, favicon
, favicons
);
191 // Bookmarks in sub-folder are encapsulated with <DL> tag.
// Third argument false: case-insensitive prefix match.
192 if (StartsWithASCII(line
, "<DL>", false)) {
193 has_subfolder
= true;
// Entering a folder body: the pending folder name becomes the innermost
// element of |path|.
194 if (!last_folder
.empty()) {
195 path
.push_back(last_folder
);
// Record the depth of the toolbar folder the first time one is entered.
198 if (last_folder_on_toolbar
&& !toolbar_folder_index
)
199 toolbar_folder_index
= path
.size();
201 // Mark next folder empty as initial state.
202 last_folder_is_empty
= true;
203 } else if (StartsWithASCII(line
, "</DL>", false)) {
205 break; // Mismatch <DL>.
// The folder being closed is the innermost element of |path|.
207 base::string16 folder_title
= path
.back();
210 if (last_folder_is_empty
) {
211 // Empty folder should be added explicitly.
212 ImportedBookmarkEntry entry
;
213 entry
.is_folder
= true;
214 entry
.creation_time
= last_folder_add_date
;
215 entry
.title
= folder_title
;
216 if (toolbar_folder_index
) {
217 // The toolbar folder should be at the top level.
218 // Make sure we don't add the toolbar folder itself if it is empty.
219 if (toolbar_folder_index
<= path
.size()) {
220 entry
.in_toolbar
= true;
221 entry
.path
.assign(path
.begin() + toolbar_folder_index
- 1,
223 bookmarks
->push_back(entry
);
226 // Add this folder to the list of |bookmarks|.
227 entry
.path
.assign(path
.begin(), path
.end());
228 bookmarks
->push_back(entry
);
231 // Parent folder include current one, so it's not empty.
232 last_folder_is_empty
= false;
// Once |path| is shallower than the recorded toolbar depth, the toolbar
// folder has been left; reset the marker.
235 if (toolbar_folder_index
> path
.size())
236 toolbar_folder_index
= 0;
243 bool ParseCharsetFromLine(const std::string
& line
, std::string
* charset
) {
244 const char kCharset
[] = "charset=";
245 if (StartsWithASCII(line
, "<META", false) &&
246 (line
.find("CONTENT=\"") != std::string::npos
||
247 line
.find("content=\"") != std::string::npos
)) {
248 size_t begin
= line
.find(kCharset
);
249 if (begin
== std::string::npos
)
251 begin
+= std::string(kCharset
).size();
252 size_t end
= line
.find_first_of('\"', begin
);
253 *charset
= line
.substr(begin
, end
- begin
);
// Parses a folder line of the form `<DT><H3 attributes>name</H3>`.
// The visible code writes the decoded, HTML-unescaped folder name to
// |folder_name|, stores the ADD_DATE attribute in |add_date| when it is
// present and in range, and sets |is_toolbar_folder| according to the
// PERSONAL_TOOLBAR_FOLDER attribute. |charset| is passed to CodepageToUTF16
// to decode the name.
// NOTE(review): this listing carries embedded line numbers and some original
// lines (return statements, local declarations) are not visible.
259 bool ParseFolderNameFromLine(const std::string
& line
,
260 const std::string
& charset
,
261 base::string16
* folder_name
,
262 bool* is_toolbar_folder
,
263 base::Time
* add_date
) {
264 const char kFolderOpen
[] = "<DT><H3";
265 const char kFolderClose
[] = "</H3>";
266 const char kToolbarFolderAttribute
[] = "PERSONAL_TOOLBAR_FOLDER";
267 const char kAddDateAttribute
[] = "ADD_DATE";
// Third argument true: the opening tag is matched case-sensitively.
269 if (!StartsWithASCII(line
, kFolderOpen
, true))
// |end| is the start of "</H3>"; |tag_end| is one past the last '>' at or
// before |end|, i.e. where the folder name text begins. If rfind() finds
// nothing it returns npos and the + 1 wraps to 0, which the size check
// below rejects.
272 size_t end
= line
.find(kFolderClose
);
273 size_t tag_end
= line
.rfind('>', end
) + 1;
274 // If no end tag or start tag is broken, we skip to find the folder name.
275 if (end
== std::string::npos
|| tag_end
< arraysize(kFolderOpen
))
// Decode the text between the opening tag and "</H3>" from |charset| to
// UTF-16, then resolve HTML entities.
278 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
279 base::OnStringConversionError::SKIP
, folder_name
);
280 *folder_name
= net::UnescapeForHTML(*folder_name
);
// The attribute text is what lies between the opening tag name and the '>'
// that closes it.
282 std::string attribute_list
= line
.substr(arraysize(kFolderOpen
),
283 tag_end
- arraysize(kFolderOpen
) - 1);
287 if (GetAttribute(attribute_list
, kAddDateAttribute
, &value
)) {
// NOTE(review): the return value of StringToInt64 is not checked here.
289 base::StringToInt64(value
, &time
);
290 // Upper bound it at 32 bits.
291 if (0 < time
&& time
< (1LL << 32))
292 *add_date
= base::Time::FromTimeT(time
);
// |is_toolbar_folder| is set to true only when the attribute exists and
// LowerCaseEqualsASCII() matches it against "true"; the other visible
// assignment sets it to false.
295 if (GetAttribute(attribute_list
, kToolbarFolderAttribute
, &value
) &&
296 LowerCaseEqualsASCII(value
, "true"))
297 *is_toolbar_folder
= true;
299 *is_toolbar_folder
= false;
// Parses a full bookmark line of the form `<DT><A attributes>title</A>`.
// The visible code resets |add_date|, decodes the link text into |title|, and
// reads the HREF, ICON, SHORTCUTURL, ADD_DATE and POST_DATA attributes,
// writing to |favicon|, |shortcut|, |add_date| and |post_data|. String values
// are converted from |charset| to UTF-16 and HTML-unescaped.
// NOTE(review): this listing carries embedded line numbers and some original
// lines are not visible (two parameters, the URL assignment, the statement
// taken for FEEDURL lines and the return statements).
304 bool ParseBookmarkFromLine(const std::string
& line
,
305 const std::string
& charset
,
306 base::string16
* title
,
309 base::string16
* shortcut
,
310 base::Time
* add_date
,
311 base::string16
* post_data
) {
312 const char kItemOpen
[] = "<DT><A";
313 const char kItemClose
[] = "</A>";
314 const char kFeedURLAttribute
[] = "FEEDURL";
315 const char kHrefAttribute
[] = "HREF";
316 const char kIconAttribute
[] = "ICON";
317 const char kShortcutURLAttribute
[] = "SHORTCUTURL";
318 const char kAddDateAttribute
[] = "ADD_DATE";
319 const char kPostDataAttribute
[] = "POST_DATA";
// Reset the output to a default-constructed Time before parsing.
326 *add_date
= base::Time();
// Third argument true: the opening tag is matched case-sensitively.
328 if (!StartsWithASCII(line
, kItemOpen
, true))
// |end| is the start of "</A>"; |tag_end| is one past the last '>' at or
// before |end|, i.e. where the link text begins. If rfind() finds nothing it
// returns npos and the + 1 wraps to 0, which the size check below rejects.
331 size_t end
= line
.find(kItemClose
);
332 size_t tag_end
= line
.rfind('>', end
) + 1;
333 if (end
== std::string::npos
|| tag_end
< arraysize(kItemOpen
))
334 return false; // No end tag or start tag is broken.
// The attribute text is what lies between the opening tag name and the '>'
// that closes it.
336 std::string attribute_list
= line
.substr(arraysize(kItemOpen
),
337 tag_end
- arraysize(kItemOpen
) - 1);
339 // We don't import Live Bookmark folders, which is Firefox's RSS reading
340 // feature, since the user never necessarily bookmarked them and we don't
341 // have this feature to update their contents.
343 if (GetAttribute(attribute_list
, kFeedURLAttribute
, &value
))
// Title: the text between the opening tag and "</A>", decoded from |charset|
// and HTML-unescaped.
347 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
348 base::OnStringConversionError::SKIP
, title
);
349 *title
= net::UnescapeForHTML(*title
);
// URL: the HREF value is decoded and unescaped into |url16| (how |url16| is
// consumed afterwards is not visible in this listing).
352 if (GetAttribute(attribute_list
, kHrefAttribute
, &value
)) {
353 base::string16 url16
;
354 base::CodepageToUTF16(value
, charset
.c_str(),
355 base::OnStringConversionError::SKIP
, &url16
);
356 url16
= net::UnescapeForHTML(url16
);
// Favicon: the ICON value is used as a URL without charset conversion.
362 if (GetAttribute(attribute_list
, kIconAttribute
, &value
))
363 *favicon
= GURL(value
);
// Keyword/shortcut.
366 if (GetAttribute(attribute_list
, kShortcutURLAttribute
, &value
)) {
367 base::CodepageToUTF16(value
, charset
.c_str(),
368 base::OnStringConversionError::SKIP
, shortcut
);
369 *shortcut
= net::UnescapeForHTML(*shortcut
);
// Creation date; stored only when it is positive and below 2^32.
373 if (GetAttribute(attribute_list
, kAddDateAttribute
, &value
)) {
// NOTE(review): the return value of StringToInt64 is not checked here.
375 base::StringToInt64(value
, &time
);
376 // Upper bound it at 32 bits.
377 if (0 < time
&& time
< (1LL << 32))
378 *add_date
= base::Time::FromTimeT(time
);
// POST data attached to the bookmark.
382 if (GetAttribute(attribute_list
, kPostDataAttribute
, &value
)) {
383 base::CodepageToUTF16(value
, charset
.c_str(),
384 base::OnStringConversionError::SKIP
, post_data
);
385 *post_data
= net::UnescapeForHTML(*post_data
);
// Lenient bookmark parser. Compared with ParseBookmarkFromLine, the visible
// code matches the opening `<DT><A` tag case-insensitively, accepts any
// closing tag that starts with "</", and looks for the link target under both
// "HREF" and "href". It decodes the link text into |title|.
// NOTE(review): this listing carries embedded line numbers and some original
// lines are not visible (one parameter, the URL assignment and the final
// return statements).
391 bool ParseMinimumBookmarkFromLine(const std::string
& line
,
392 const std::string
& charset
,
393 base::string16
* title
,
395 const char kItemOpen
[] = "<DT><A";
396 const char kItemClose
[] = "</";
397 const char kHrefAttributeUpper
[] = "HREF";
398 const char kHrefAttributeLower
[] = "href";
403 // Case-insensitive check of open tag.
404 if (!StartsWithASCII(line
, kItemOpen
, false))
407 // Find any close tag.
// |end| is the first "</" in the line; |tag_end| is one past the last '>' at
// or before it. If rfind() finds nothing it returns npos and the + 1 wraps
// to 0, which the size check below rejects.
408 size_t end
= line
.find(kItemClose
);
409 size_t tag_end
= line
.rfind('>', end
) + 1;
410 if (end
== std::string::npos
|| tag_end
< arraysize(kItemOpen
))
411 return false; // No end tag or start tag is broken.
// The attribute text is what lies between the opening tag name and the '>'
// that closes it.
413 std::string attribute_list
= line
.substr(arraysize(kItemOpen
),
414 tag_end
- arraysize(kItemOpen
) - 1);
// Title: the text between the opening tag and the close tag, decoded from
// |charset| and HTML-unescaped.
417 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
418 base::OnStringConversionError::SKIP
, title
);
419 *title
= net::UnescapeForHTML(*title
);
// GetAttribute() matches attribute names case-sensitively, so both spellings
// are tried.
423 if (GetAttribute(attribute_list
, kHrefAttributeUpper
, &value
) ||
424 GetAttribute(attribute_list
, kHrefAttributeLower
, &value
)) {
// The charset conversion of the URL is done only when a charset is known.
425 if (charset
.length() != 0) {
426 base::string16 url16
;
427 base::CodepageToUTF16(value
, charset
.c_str(),
428 base::OnStringConversionError::SKIP
, &url16
);
429 url16
= net::UnescapeForHTML(url16
);
440 } // namespace internal
442 } // namespace bookmark_html_reader