// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/utility/importer/bookmark_html_reader.h"

#include <stddef.h>
#include <stdint.h>

#include <string>
#include <vector>

#include "base/callback.h"
#include "base/files/file_util.h"
#include "base/i18n/icu_string_conversions.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "chrome/common/importer/imported_bookmark_entry.h"
#include "chrome/utility/importer/favicon_reencode.h"
#include "components/search_engines/search_terms_data.h"
#include "components/search_engines/template_url.h"
#include "net/base/data_url.h"
#include "net/base/escape.h"
#include "url/gurl.h"
#include "url/url_constants.h"

// Fetches the given |attribute| value from the |attribute_list|. Returns true
// if successful, and |value| will contain the value.
//
// The attribute must be written as NAME="VALUE". The value ends at the first
// double quote that is not immediately preceded by a backslash. An empty value
// (NAME="") succeeds with an empty |value|. Returns false, leaving |value|
// untouched, when the attribute is absent or its closing quote is missing.
bool GetAttribute(const std::string& attribute_list,
                  const std::string& attribute,
                  std::string* value) {
  const std::string prefix = attribute + "=\"";

  const size_t name_pos = attribute_list.find(prefix);
  if (name_pos == std::string::npos)
    return false;  // Can't find the attribute.

  // First character after the opening quote.
  const size_t begin = name_pos + prefix.size();

  // Scan from |begin| itself rather than one past it, so that an empty value
  // is recognized instead of running on into the following attribute.
  // |end - 1| is always a valid index because |prefix| is never empty.
  size_t end = begin;
  while (end < attribute_list.size() &&
         !(attribute_list[end] == '"' && attribute_list[end - 1] != '\\')) {
    ++end;
  }

  if (end >= attribute_list.size())
    return false;  // The value is not quoted.

  *value = attribute_list.substr(begin, end - begin);
  return true;
}

55 // Given the URL of a page and a favicon data URL, adds an appropriate record
56 // to the given favicon usage vector.
57 void DataURLToFaviconUsage(const GURL
& link_url
,
58 const GURL
& favicon_data
,
59 favicon_base::FaviconUsageDataList
* favicons
) {
60 if (!link_url
.is_valid() || !favicon_data
.is_valid() ||
61 !favicon_data
.SchemeIs(url::kDataScheme
))
64 // Parse the data URL.
65 std::string mime_type
, char_set
, data
;
66 if (!net::DataURL::Parse(favicon_data
, &mime_type
, &char_set
, &data
) ||
70 favicon_base::FaviconUsageData usage
;
71 if (!importer::ReencodeFavicon(
72 reinterpret_cast<const unsigned char*>(&data
[0]),
73 data
.size(), &usage
.png_data
))
74 return; // Unable to decode.
76 // We need to make up a URL for the favicon. We use a version of the page's
77 // URL so that we can be sure it will not collide.
78 usage
.favicon_url
= GURL(std::string("made-up-favicon:") + link_url
.spec());
80 // We only have one URL per favicon for Firefox 2 bookmarks.
81 usage
.urls
.insert(link_url
);
83 favicons
->push_back(usage
);
88 namespace bookmark_html_reader
{
90 void ImportBookmarksFile(
91 const base::Callback
<bool(void)>& cancellation_callback
,
92 const base::Callback
<bool(const GURL
&)>& valid_url_callback
,
93 const base::FilePath
& file_path
,
94 std::vector
<ImportedBookmarkEntry
>* bookmarks
,
95 std::vector
<importer::SearchEngineInfo
>* search_engines
,
96 favicon_base::FaviconUsageDataList
* favicons
) {
98 base::ReadFileToString(file_path
, &content
);
99 std::vector
<std::string
> lines
= base::SplitString(
100 content
, "\n", base::TRIM_WHITESPACE
, base::SPLIT_WANT_ALL
);
102 base::string16 last_folder
;
103 bool last_folder_on_toolbar
= false;
104 bool last_folder_is_empty
= true;
105 bool has_subfolder
= false;
106 base::Time last_folder_add_date
;
107 std::vector
<base::string16
> path
;
108 size_t toolbar_folder_index
= 0;
109 std::string charset
= "UTF-8"; // If no charset is specified, assume utf-8.
112 (cancellation_callback
.is_null() || !cancellation_callback
.Run());
115 base::TrimString(lines
[i
], " ", &line
);
117 // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
118 // separator in Firefox that Chrome does not support. Note that there can be
119 // multiple "<HR>" tags at the beginning of a single line.
120 // See http://crbug.com/257474.
121 static const char kHrTag
[] = "<HR>";
122 while (base::StartsWith(line
, kHrTag
,
123 base::CompareCase::INSENSITIVE_ASCII
)) {
124 line
.erase(0, arraysize(kHrTag
) - 1);
125 base::TrimString(line
, " ", &line
);
128 // Get the encoding of the bookmark file.
129 if (internal::ParseCharsetFromLine(line
, &charset
))
132 // Get the folder name.
133 if (internal::ParseFolderNameFromLine(line
,
136 &last_folder_on_toolbar
,
137 &last_folder_add_date
)) {
141 // Get the bookmark entry.
142 base::string16 title
;
143 base::string16 shortcut
;
146 base::string16 post_data
;
148 // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
151 internal::ParseBookmarkFromLine(line
, charset
, &title
,
152 &url
, &favicon
, &shortcut
,
153 &add_date
, &post_data
) ||
154 internal::ParseMinimumBookmarkFromLine(line
, charset
, &title
, &url
);
156 // If bookmark contains a valid replaceable url and a keyword then import
157 // it as search engine.
158 std::string search_engine_url
;
159 if (is_bookmark
&& post_data
.empty() &&
160 CanImportURLAsSearchEngine(url
, &search_engine_url
) &&
162 importer::SearchEngineInfo search_engine_info
;
163 search_engine_info
.url
.assign(base::UTF8ToUTF16(search_engine_url
));
164 search_engine_info
.keyword
= shortcut
;
165 search_engine_info
.display_name
= title
;
166 search_engines
->push_back(search_engine_info
);
171 last_folder_is_empty
= false;
175 (valid_url_callback
.is_null() || valid_url_callback
.Run(url
))) {
176 if (toolbar_folder_index
> path
.size() && !path
.empty()) {
177 NOTREACHED(); // error in parsing.
181 ImportedBookmarkEntry entry
;
182 entry
.creation_time
= add_date
;
186 if (toolbar_folder_index
) {
187 // The toolbar folder should be at the top level.
188 entry
.in_toolbar
= true;
189 entry
.path
.assign(path
.begin() + toolbar_folder_index
- 1, path
.end());
191 // Add this bookmark to the list of |bookmarks|.
192 if (!has_subfolder
&& !last_folder
.empty()) {
193 path
.push_back(last_folder
);
196 entry
.path
.assign(path
.begin(), path
.end());
198 bookmarks
->push_back(entry
);
200 // Save the favicon. DataURLToFaviconUsage will handle the case where
201 // there is no favicon.
203 DataURLToFaviconUsage(url
, favicon
, favicons
);
208 // Bookmarks in sub-folder are encapsulated with <DL> tag.
209 if (base::StartsWith(line
, "<DL>", base::CompareCase::INSENSITIVE_ASCII
)) {
210 has_subfolder
= true;
211 if (!last_folder
.empty()) {
212 path
.push_back(last_folder
);
215 if (last_folder_on_toolbar
&& !toolbar_folder_index
)
216 toolbar_folder_index
= path
.size();
218 // Mark next folder empty as initial state.
219 last_folder_is_empty
= true;
220 } else if (base::StartsWith(line
, "</DL>",
221 base::CompareCase::INSENSITIVE_ASCII
)) {
223 break; // Mismatch <DL>.
225 base::string16 folder_title
= path
.back();
228 if (last_folder_is_empty
) {
229 // Empty folder should be added explicitly.
230 ImportedBookmarkEntry entry
;
231 entry
.is_folder
= true;
232 entry
.creation_time
= last_folder_add_date
;
233 entry
.title
= folder_title
;
234 if (toolbar_folder_index
) {
235 // The toolbar folder should be at the top level.
236 // Make sure we don't add the toolbar folder itself if it is empty.
237 if (toolbar_folder_index
<= path
.size()) {
238 entry
.in_toolbar
= true;
239 entry
.path
.assign(path
.begin() + toolbar_folder_index
- 1,
241 bookmarks
->push_back(entry
);
244 // Add this folder to the list of |bookmarks|.
245 entry
.path
.assign(path
.begin(), path
.end());
246 bookmarks
->push_back(entry
);
249 // Parent folder include current one, so it's not empty.
250 last_folder_is_empty
= false;
253 if (toolbar_folder_index
> path
.size())
254 toolbar_folder_index
= 0;
259 bool CanImportURLAsSearchEngine(const GURL
& url
,
260 std::string
* search_engine_url
) {
261 std::string url_spec
= url
.possibly_invalid_spec();
263 if (url_spec
.empty())
266 url_spec
= net::UnescapeURLComponent(url_spec
,
267 net::UnescapeRule::URL_SPECIAL_CHARS
);
269 // Replace replacement terms ("%s") in |url_spec| with {searchTerms}.
271 TemplateURLRef::DisplayURLToURLRef(base::UTF8ToUTF16(url_spec
));
273 TemplateURLData data
;
274 data
.SetURL(url_spec
);
275 *search_engine_url
= url_spec
;
276 return TemplateURL(data
).SupportsReplacement(SearchTermsData());
281 bool ParseCharsetFromLine(const std::string
& line
, std::string
* charset
) {
282 const char kCharset
[] = "charset=";
283 if (base::StartsWith(line
, "<META", base::CompareCase::INSENSITIVE_ASCII
) &&
284 (line
.find("CONTENT=\"") != std::string::npos
||
285 line
.find("content=\"") != std::string::npos
)) {
286 size_t begin
= line
.find(kCharset
);
287 if (begin
== std::string::npos
)
289 begin
+= std::string(kCharset
).size();
290 size_t end
= line
.find_first_of('\"', begin
);
291 *charset
= line
.substr(begin
, end
- begin
);
297 bool ParseFolderNameFromLine(const std::string
& line
,
298 const std::string
& charset
,
299 base::string16
* folder_name
,
300 bool* is_toolbar_folder
,
301 base::Time
* add_date
) {
302 const char kFolderOpen
[] = "<DT><H3";
303 const char kFolderClose
[] = "</H3>";
304 const char kToolbarFolderAttribute
[] = "PERSONAL_TOOLBAR_FOLDER";
305 const char kAddDateAttribute
[] = "ADD_DATE";
307 if (!base::StartsWith(line
, kFolderOpen
, base::CompareCase::SENSITIVE
))
310 size_t end
= line
.find(kFolderClose
);
311 size_t tag_end
= line
.rfind('>', end
) + 1;
312 // If no end tag or start tag is broken, we skip to find the folder name.
313 if (end
== std::string::npos
|| tag_end
< arraysize(kFolderOpen
))
316 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
317 base::OnStringConversionError::SKIP
, folder_name
);
318 *folder_name
= net::UnescapeForHTML(*folder_name
);
320 std::string attribute_list
= line
.substr(arraysize(kFolderOpen
),
321 tag_end
- arraysize(kFolderOpen
) - 1);
325 if (GetAttribute(attribute_list
, kAddDateAttribute
, &value
)) {
327 base::StringToInt64(value
, &time
);
328 // Upper bound it at 32 bits.
329 if (0 < time
&& time
< (1LL << 32))
330 *add_date
= base::Time::FromTimeT(time
);
333 if (GetAttribute(attribute_list
, kToolbarFolderAttribute
, &value
) &&
334 base::LowerCaseEqualsASCII(value
, "true"))
335 *is_toolbar_folder
= true;
337 *is_toolbar_folder
= false;
342 bool ParseBookmarkFromLine(const std::string
& line
,
343 const std::string
& charset
,
344 base::string16
* title
,
347 base::string16
* shortcut
,
348 base::Time
* add_date
,
349 base::string16
* post_data
) {
350 const char kItemOpen
[] = "<DT><A";
351 const char kItemClose
[] = "</A>";
352 const char kFeedURLAttribute
[] = "FEEDURL";
353 const char kHrefAttribute
[] = "HREF";
354 const char kIconAttribute
[] = "ICON";
355 const char kShortcutURLAttribute
[] = "SHORTCUTURL";
356 const char kAddDateAttribute
[] = "ADD_DATE";
357 const char kPostDataAttribute
[] = "POST_DATA";
364 *add_date
= base::Time();
366 if (!base::StartsWith(line
, kItemOpen
, base::CompareCase::SENSITIVE
))
369 size_t end
= line
.find(kItemClose
);
370 size_t tag_end
= line
.rfind('>', end
) + 1;
371 if (end
== std::string::npos
|| tag_end
< arraysize(kItemOpen
))
372 return false; // No end tag or start tag is broken.
374 std::string attribute_list
= line
.substr(arraysize(kItemOpen
),
375 tag_end
- arraysize(kItemOpen
) - 1);
377 // We don't import Live Bookmark folders, which is Firefox's RSS reading
378 // feature, since the user never necessarily bookmarked them and we don't
379 // have this feature to update their contents.
381 if (GetAttribute(attribute_list
, kFeedURLAttribute
, &value
))
385 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
386 base::OnStringConversionError::SKIP
, title
);
387 *title
= net::UnescapeForHTML(*title
);
390 if (GetAttribute(attribute_list
, kHrefAttribute
, &value
)) {
391 base::string16 url16
;
392 base::CodepageToUTF16(value
, charset
.c_str(),
393 base::OnStringConversionError::SKIP
, &url16
);
394 url16
= net::UnescapeForHTML(url16
);
400 if (GetAttribute(attribute_list
, kIconAttribute
, &value
))
401 *favicon
= GURL(value
);
404 if (GetAttribute(attribute_list
, kShortcutURLAttribute
, &value
)) {
405 base::CodepageToUTF16(value
, charset
.c_str(),
406 base::OnStringConversionError::SKIP
, shortcut
);
407 *shortcut
= net::UnescapeForHTML(*shortcut
);
411 if (GetAttribute(attribute_list
, kAddDateAttribute
, &value
)) {
413 base::StringToInt64(value
, &time
);
414 // Upper bound it at 32 bits.
415 if (0 < time
&& time
< (1LL << 32))
416 *add_date
= base::Time::FromTimeT(time
);
420 if (GetAttribute(attribute_list
, kPostDataAttribute
, &value
)) {
421 base::CodepageToUTF16(value
, charset
.c_str(),
422 base::OnStringConversionError::SKIP
, post_data
);
423 *post_data
= net::UnescapeForHTML(*post_data
);
429 bool ParseMinimumBookmarkFromLine(const std::string
& line
,
430 const std::string
& charset
,
431 base::string16
* title
,
433 const char kItemOpen
[] = "<DT><A";
434 const char kItemClose
[] = "</";
435 const char kHrefAttributeUpper
[] = "HREF";
436 const char kHrefAttributeLower
[] = "href";
441 // Case-insensitive check of open tag.
442 if (!base::StartsWith(line
, kItemOpen
, base::CompareCase::INSENSITIVE_ASCII
))
445 // Find any close tag.
446 size_t end
= line
.find(kItemClose
);
447 size_t tag_end
= line
.rfind('>', end
) + 1;
448 if (end
== std::string::npos
|| tag_end
< arraysize(kItemOpen
))
449 return false; // No end tag or start tag is broken.
451 std::string attribute_list
= line
.substr(arraysize(kItemOpen
),
452 tag_end
- arraysize(kItemOpen
) - 1);
455 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
456 base::OnStringConversionError::SKIP
, title
);
457 *title
= net::UnescapeForHTML(*title
);
461 if (GetAttribute(attribute_list
, kHrefAttributeUpper
, &value
) ||
462 GetAttribute(attribute_list
, kHrefAttributeLower
, &value
)) {
463 if (charset
.length() != 0) {
464 base::string16 url16
;
465 base::CodepageToUTF16(value
, charset
.c_str(),
466 base::OnStringConversionError::SKIP
, &url16
);
467 url16
= net::UnescapeForHTML(url16
);
478 } // namespace internal
480 } // namespace bookmark_html_reader