// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
5 #include "chrome/utility/importer/bookmark_html_reader.h"
7 #include "base/callback.h"
8 #include "base/files/file_util.h"
9 #include "base/i18n/icu_string_conversions.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "base/time/time.h"
15 #include "chrome/common/importer/imported_bookmark_entry.h"
16 #include "chrome/utility/importer/favicon_reencode.h"
17 #include "components/search_engines/search_terms_data.h"
18 #include "components/search_engines/template_url.h"
19 #include "net/base/data_url.h"
20 #include "net/base/escape.h"
22 #include "url/url_constants.h"
// Fetches the given |attribute| value from the |attribute_list|. Returns true
// if successful, and |value| will contain the value.
//
// The value is the text between the double quote that follows "|attribute|="
// and the next double quote that is not preceded by a backslash. An empty
// value (attribute="") succeeds with an empty |value|. |value| is left
// untouched when false is returned.
bool GetAttribute(const std::string& attribute_list,
                  const std::string& attribute,
                  std::string* value) {
  const char kQuote[] = "\"";

  size_t begin = attribute_list.find(attribute + "=" + kQuote);
  if (begin == std::string::npos)
    return false;  // Can't find the attribute.

  // Skip past the attribute name, the '=' and the opening quote.
  begin += attribute.size() + 2;

  // Scan from |begin| itself rather than |begin| + 1: for an empty value the
  // closing quote sits exactly at |begin|, and skipping it would make the scan
  // stop at the opening quote of whichever attribute comes next.
  size_t end = begin;
  while (end < attribute_list.size()) {
    // |end| - 1 is always valid here because |begin| is at least 2.
    if (attribute_list[end] == '"' && attribute_list[end - 1] != '\\')
      break;
    ++end;
  }

  if (end == attribute_list.size())
    return false;  // The value is not quoted.

  *value = attribute_list.substr(begin, end - begin);
  return true;
}
55 // Given the URL of a page and a favicon data URL, adds an appropriate record
56 // to the given favicon usage vector.
57 void DataURLToFaviconUsage(const GURL
& link_url
,
58 const GURL
& favicon_data
,
59 favicon_base::FaviconUsageDataList
* favicons
) {
60 if (!link_url
.is_valid() || !favicon_data
.is_valid() ||
61 !favicon_data
.SchemeIs(url::kDataScheme
))
64 // Parse the data URL.
65 std::string mime_type
, char_set
, data
;
66 if (!net::DataURL::Parse(favicon_data
, &mime_type
, &char_set
, &data
) ||
70 favicon_base::FaviconUsageData usage
;
71 if (!importer::ReencodeFavicon(
72 reinterpret_cast<const unsigned char*>(&data
[0]),
73 data
.size(), &usage
.png_data
))
74 return; // Unable to decode.
76 // We need to make up a URL for the favicon. We use a version of the page's
77 // URL so that we can be sure it will not collide.
78 usage
.favicon_url
= GURL(std::string("made-up-favicon:") + link_url
.spec());
80 // We only have one URL per favicon for Firefox 2 bookmarks.
81 usage
.urls
.insert(link_url
);
83 favicons
->push_back(usage
);
88 namespace bookmark_html_reader
{
90 void ImportBookmarksFile(
91 const base::Callback
<bool(void)>& cancellation_callback
,
92 const base::Callback
<bool(const GURL
&)>& valid_url_callback
,
93 const base::FilePath
& file_path
,
94 std::vector
<ImportedBookmarkEntry
>* bookmarks
,
95 std::vector
<importer::SearchEngineInfo
>* search_engines
,
96 favicon_base::FaviconUsageDataList
* favicons
) {
98 base::ReadFileToString(file_path
, &content
);
99 std::vector
<std::string
> lines
;
100 base::SplitString(content
, '\n', &lines
);
102 base::string16 last_folder
;
103 bool last_folder_on_toolbar
= false;
104 bool last_folder_is_empty
= true;
105 bool has_subfolder
= false;
106 base::Time last_folder_add_date
;
107 std::vector
<base::string16
> path
;
108 size_t toolbar_folder_index
= 0;
112 (cancellation_callback
.is_null() || !cancellation_callback
.Run());
115 base::TrimString(lines
[i
], " ", &line
);
117 // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
118 // separator in Firefox that Chrome does not support. Note that there can be
119 // multiple "<HR>" tags at the beginning of a single line.
120 // See http://crbug.com/257474.
121 static const char kHrTag
[] = "<HR>";
122 while (StartsWithASCII(line
, kHrTag
, false)) {
123 line
.erase(0, arraysize(kHrTag
) - 1);
124 base::TrimString(line
, " ", &line
);
127 // Get the encoding of the bookmark file.
128 if (internal::ParseCharsetFromLine(line
, &charset
))
131 // Get the folder name.
132 if (internal::ParseFolderNameFromLine(line
,
135 &last_folder_on_toolbar
,
136 &last_folder_add_date
)) {
140 // Get the bookmark entry.
141 base::string16 title
;
142 base::string16 shortcut
;
145 base::string16 post_data
;
147 // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
150 internal::ParseBookmarkFromLine(line
, charset
, &title
,
151 &url
, &favicon
, &shortcut
,
152 &add_date
, &post_data
) ||
153 internal::ParseMinimumBookmarkFromLine(line
, charset
, &title
, &url
);
155 // If bookmark contains a valid replaceable url and a keyword then import
156 // it as search engine.
157 std::string search_engine_url
;
158 if (is_bookmark
&& post_data
.empty() &&
159 CanImportURLAsSearchEngine(url
, &search_engine_url
) &&
161 importer::SearchEngineInfo search_engine_info
;
162 search_engine_info
.url
.assign(base::UTF8ToUTF16(search_engine_url
));
163 search_engine_info
.keyword
= shortcut
;
164 search_engine_info
.display_name
= title
;
165 search_engines
->push_back(search_engine_info
);
170 last_folder_is_empty
= false;
174 (valid_url_callback
.is_null() || valid_url_callback
.Run(url
))) {
175 if (toolbar_folder_index
> path
.size() && !path
.empty()) {
176 NOTREACHED(); // error in parsing.
180 ImportedBookmarkEntry entry
;
181 entry
.creation_time
= add_date
;
185 if (toolbar_folder_index
) {
186 // The toolbar folder should be at the top level.
187 entry
.in_toolbar
= true;
188 entry
.path
.assign(path
.begin() + toolbar_folder_index
- 1, path
.end());
190 // Add this bookmark to the list of |bookmarks|.
191 if (!has_subfolder
&& !last_folder
.empty()) {
192 path
.push_back(last_folder
);
195 entry
.path
.assign(path
.begin(), path
.end());
197 bookmarks
->push_back(entry
);
199 // Save the favicon. DataURLToFaviconUsage will handle the case where
200 // there is no favicon.
202 DataURLToFaviconUsage(url
, favicon
, favicons
);
207 // Bookmarks in sub-folder are encapsulated with <DL> tag.
208 if (StartsWithASCII(line
, "<DL>", false)) {
209 has_subfolder
= true;
210 if (!last_folder
.empty()) {
211 path
.push_back(last_folder
);
214 if (last_folder_on_toolbar
&& !toolbar_folder_index
)
215 toolbar_folder_index
= path
.size();
217 // Mark next folder empty as initial state.
218 last_folder_is_empty
= true;
219 } else if (StartsWithASCII(line
, "</DL>", false)) {
221 break; // Mismatch <DL>.
223 base::string16 folder_title
= path
.back();
226 if (last_folder_is_empty
) {
227 // Empty folder should be added explicitly.
228 ImportedBookmarkEntry entry
;
229 entry
.is_folder
= true;
230 entry
.creation_time
= last_folder_add_date
;
231 entry
.title
= folder_title
;
232 if (toolbar_folder_index
) {
233 // The toolbar folder should be at the top level.
234 // Make sure we don't add the toolbar folder itself if it is empty.
235 if (toolbar_folder_index
<= path
.size()) {
236 entry
.in_toolbar
= true;
237 entry
.path
.assign(path
.begin() + toolbar_folder_index
- 1,
239 bookmarks
->push_back(entry
);
242 // Add this folder to the list of |bookmarks|.
243 entry
.path
.assign(path
.begin(), path
.end());
244 bookmarks
->push_back(entry
);
247 // Parent folder include current one, so it's not empty.
248 last_folder_is_empty
= false;
251 if (toolbar_folder_index
> path
.size())
252 toolbar_folder_index
= 0;
257 bool CanImportURLAsSearchEngine(const GURL
& url
,
258 std::string
* search_engine_url
) {
259 std::string url_spec
= url
.possibly_invalid_spec();
261 if (url_spec
.empty())
264 url_spec
= net::UnescapeURLComponent(url_spec
,
265 net::UnescapeRule::URL_SPECIAL_CHARS
);
267 // Replace replacement terms ("%s") in |url_spec| with {searchTerms}.
269 TemplateURLRef::DisplayURLToURLRef(base::UTF8ToUTF16(url_spec
));
271 TemplateURLData data
;
272 data
.SetURL(url_spec
);
273 *search_engine_url
= url_spec
;
274 return TemplateURL(data
).SupportsReplacement(SearchTermsData());
279 bool ParseCharsetFromLine(const std::string
& line
, std::string
* charset
) {
280 const char kCharset
[] = "charset=";
281 if (StartsWithASCII(line
, "<META", false) &&
282 (line
.find("CONTENT=\"") != std::string::npos
||
283 line
.find("content=\"") != std::string::npos
)) {
284 size_t begin
= line
.find(kCharset
);
285 if (begin
== std::string::npos
)
287 begin
+= std::string(kCharset
).size();
288 size_t end
= line
.find_first_of('\"', begin
);
289 *charset
= line
.substr(begin
, end
- begin
);
295 bool ParseFolderNameFromLine(const std::string
& line
,
296 const std::string
& charset
,
297 base::string16
* folder_name
,
298 bool* is_toolbar_folder
,
299 base::Time
* add_date
) {
300 const char kFolderOpen
[] = "<DT><H3";
301 const char kFolderClose
[] = "</H3>";
302 const char kToolbarFolderAttribute
[] = "PERSONAL_TOOLBAR_FOLDER";
303 const char kAddDateAttribute
[] = "ADD_DATE";
305 if (!StartsWithASCII(line
, kFolderOpen
, true))
308 size_t end
= line
.find(kFolderClose
);
309 size_t tag_end
= line
.rfind('>', end
) + 1;
310 // If no end tag or start tag is broken, we skip to find the folder name.
311 if (end
== std::string::npos
|| tag_end
< arraysize(kFolderOpen
))
314 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
315 base::OnStringConversionError::SKIP
, folder_name
);
316 *folder_name
= net::UnescapeForHTML(*folder_name
);
318 std::string attribute_list
= line
.substr(arraysize(kFolderOpen
),
319 tag_end
- arraysize(kFolderOpen
) - 1);
323 if (GetAttribute(attribute_list
, kAddDateAttribute
, &value
)) {
325 base::StringToInt64(value
, &time
);
326 // Upper bound it at 32 bits.
327 if (0 < time
&& time
< (1LL << 32))
328 *add_date
= base::Time::FromTimeT(time
);
331 if (GetAttribute(attribute_list
, kToolbarFolderAttribute
, &value
) &&
332 LowerCaseEqualsASCII(value
, "true"))
333 *is_toolbar_folder
= true;
335 *is_toolbar_folder
= false;
340 bool ParseBookmarkFromLine(const std::string
& line
,
341 const std::string
& charset
,
342 base::string16
* title
,
345 base::string16
* shortcut
,
346 base::Time
* add_date
,
347 base::string16
* post_data
) {
348 const char kItemOpen
[] = "<DT><A";
349 const char kItemClose
[] = "</A>";
350 const char kFeedURLAttribute
[] = "FEEDURL";
351 const char kHrefAttribute
[] = "HREF";
352 const char kIconAttribute
[] = "ICON";
353 const char kShortcutURLAttribute
[] = "SHORTCUTURL";
354 const char kAddDateAttribute
[] = "ADD_DATE";
355 const char kPostDataAttribute
[] = "POST_DATA";
362 *add_date
= base::Time();
364 if (!StartsWithASCII(line
, kItemOpen
, true))
367 size_t end
= line
.find(kItemClose
);
368 size_t tag_end
= line
.rfind('>', end
) + 1;
369 if (end
== std::string::npos
|| tag_end
< arraysize(kItemOpen
))
370 return false; // No end tag or start tag is broken.
372 std::string attribute_list
= line
.substr(arraysize(kItemOpen
),
373 tag_end
- arraysize(kItemOpen
) - 1);
375 // We don't import Live Bookmark folders, which is Firefox's RSS reading
376 // feature, since the user never necessarily bookmarked them and we don't
377 // have this feature to update their contents.
379 if (GetAttribute(attribute_list
, kFeedURLAttribute
, &value
))
383 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
384 base::OnStringConversionError::SKIP
, title
);
385 *title
= net::UnescapeForHTML(*title
);
388 if (GetAttribute(attribute_list
, kHrefAttribute
, &value
)) {
389 base::string16 url16
;
390 base::CodepageToUTF16(value
, charset
.c_str(),
391 base::OnStringConversionError::SKIP
, &url16
);
392 url16
= net::UnescapeForHTML(url16
);
398 if (GetAttribute(attribute_list
, kIconAttribute
, &value
))
399 *favicon
= GURL(value
);
402 if (GetAttribute(attribute_list
, kShortcutURLAttribute
, &value
)) {
403 base::CodepageToUTF16(value
, charset
.c_str(),
404 base::OnStringConversionError::SKIP
, shortcut
);
405 *shortcut
= net::UnescapeForHTML(*shortcut
);
409 if (GetAttribute(attribute_list
, kAddDateAttribute
, &value
)) {
411 base::StringToInt64(value
, &time
);
412 // Upper bound it at 32 bits.
413 if (0 < time
&& time
< (1LL << 32))
414 *add_date
= base::Time::FromTimeT(time
);
418 if (GetAttribute(attribute_list
, kPostDataAttribute
, &value
)) {
419 base::CodepageToUTF16(value
, charset
.c_str(),
420 base::OnStringConversionError::SKIP
, post_data
);
421 *post_data
= net::UnescapeForHTML(*post_data
);
427 bool ParseMinimumBookmarkFromLine(const std::string
& line
,
428 const std::string
& charset
,
429 base::string16
* title
,
431 const char kItemOpen
[] = "<DT><A";
432 const char kItemClose
[] = "</";
433 const char kHrefAttributeUpper
[] = "HREF";
434 const char kHrefAttributeLower
[] = "href";
439 // Case-insensitive check of open tag.
440 if (!StartsWithASCII(line
, kItemOpen
, false))
443 // Find any close tag.
444 size_t end
= line
.find(kItemClose
);
445 size_t tag_end
= line
.rfind('>', end
) + 1;
446 if (end
== std::string::npos
|| tag_end
< arraysize(kItemOpen
))
447 return false; // No end tag or start tag is broken.
449 std::string attribute_list
= line
.substr(arraysize(kItemOpen
),
450 tag_end
- arraysize(kItemOpen
) - 1);
453 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
454 base::OnStringConversionError::SKIP
, title
);
455 *title
= net::UnescapeForHTML(*title
);
459 if (GetAttribute(attribute_list
, kHrefAttributeUpper
, &value
) ||
460 GetAttribute(attribute_list
, kHrefAttributeLower
, &value
)) {
461 if (charset
.length() != 0) {
462 base::string16 url16
;
463 base::CodepageToUTF16(value
, charset
.c_str(),
464 base::OnStringConversionError::SKIP
, &url16
);
465 url16
= net::UnescapeForHTML(url16
);
476 } // namespace internal
478 } // namespace bookmark_html_reader