1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/utility/importer/bookmark_html_reader.h"
7 #include "base/callback.h"
8 #include "base/files/file_util.h"
9 #include "base/i18n/icu_string_conversions.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "base/time/time.h"
15 #include "chrome/common/importer/imported_bookmark_entry.h"
16 #include "chrome/common/importer/imported_favicon_usage.h"
17 #include "chrome/utility/importer/favicon_reencode.h"
18 #include "components/search_engines/search_terms_data.h"
19 #include "components/search_engines/template_url.h"
20 #include "net/base/data_url.h"
21 #include "net/base/escape.h"
23 #include "url/url_constants.h"
// Fetches the given |attribute| value from the |attribute_list|. Returns true
// if successful, and |value| will contain the value.
//
// The attribute is located by searching for the literal text `attribute="`;
// the value runs from just after that opening quote up to (not including) the
// next double quote that is not immediately preceded by a backslash. Escape
// sequences inside the value are returned verbatim, not decoded.
//
// Returns false, leaving |value| untouched, when the attribute is absent or
// its value has no closing quote.
//
// NOTE(review): the lookup is a plain substring search, so |attribute| also
// matches the tail of a longer attribute name (e.g. "DATE" inside
// ADD_DATE="..."). Callers pass full attribute names, so this is left as is.
bool GetAttribute(const std::string& attribute_list,
                  const std::string& attribute,
                  std::string* value) {
  const char kQuote[] = "\"";

  size_t begin = attribute_list.find(attribute + "=" + kQuote);
  if (begin == std::string::npos)
    return false;  // Can't find the attribute.

  // Skip the attribute name, the '=' and the opening quote.
  begin += attribute.size() + 2;

  // Scan from |begin| itself (not |begin| + 1) so that an empty value
  // (ATTR="") terminates at its own closing quote instead of running on to
  // the quote of a following attribute. |begin| is at least 2 here, so
  // |end - 1| is always a valid index.
  size_t end = begin;
  while (end < attribute_list.size()) {
    if (attribute_list[end] == '"' && attribute_list[end - 1] != '\\')
      break;
    ++end;
  }

  if (end == attribute_list.size())
    return false;  // The value is not quoted.

  *value = attribute_list.substr(begin, end - begin);
  return true;
}
56 // Given the URL of a page and a favicon data URL, adds an appropriate record
57 // to the given favicon usage vector.
//
// |link_url| is the bookmarked page, |favicon_data| is the icon as a URL and
// |favicons| receives the new ImportedFaviconUsage. Both URLs must be valid
// and |favicon_data| must use the data scheme. The payload is decoded with
// net::DataURL::Parse() and re-encoded with importer::ReencodeFavicon(); the
// function returns without appending when re-encoding fails.
// NOTE(review): the other guard conditions presumably return early as well --
// confirm against the full function body.
58 void DataURLToFaviconUsage(
60 const GURL
& favicon_data
,
61 std::vector
<ImportedFaviconUsage
>* favicons
) {
62 if (!link_url
.is_valid() || !favicon_data
.is_valid() ||
63 !favicon_data
.SchemeIs(url::kDataScheme
))
66 // Parse the data URL.
67 std::string mime_type
, char_set
, data
;
68 if (!net::DataURL::Parse(favicon_data
, &mime_type
, &char_set
, &data
) ||
// Re-encode the raw payload bytes into |usage.png_data|.
72 ImportedFaviconUsage usage
;
73 if (!importer::ReencodeFavicon(
74 reinterpret_cast<const unsigned char*>(&data
[0]),
75 data
.size(), &usage
.png_data
))
76 return; // Unable to decode.
78 // We need to make up a URL for the favicon. We use a version of the page's
79 // URL so that we can be sure it will not collide.
80 usage
.favicon_url
= GURL(std::string("made-up-favicon:") + link_url
.spec());
82 // We only have one URL per favicon for Firefox 2 bookmarks.
83 usage
.urls
.insert(link_url
);
85 favicons
->push_back(usage
);
90 namespace bookmark_html_reader
{
// Reads the bookmark HTML file at |file_path|, splits it on '\n' and processes
// it one line at a time, appending results to the output vectors:
//   |bookmarks|      - an ImportedBookmarkEntry for each accepted bookmark,
//                      plus an explicit folder entry for folders that were
//                      still empty when their </DL> was reached.
//   |search_engines| - bookmarks without POST data whose URL passes
//                      CanImportURLAsSearchEngine().
//   |favicons|       - filled through DataURLToFaviconUsage().
// |cancellation_callback| and |valid_url_callback| may be null; a null
// callback is treated as "not cancelled" and "URL accepted" respectively.
92 void ImportBookmarksFile(
93 const base::Callback
<bool(void)>& cancellation_callback
,
94 const base::Callback
<bool(const GURL
&)>& valid_url_callback
,
95 const base::FilePath
& file_path
,
96 std::vector
<ImportedBookmarkEntry
>* bookmarks
,
97 std::vector
<importer::SearchEngineInfo
>* search_engines
,
98 std::vector
<ImportedFaviconUsage
>* favicons
) {
// NOTE(review): the result of ReadFileToString() is not checked here;
// presumably an unreadable file just yields no lines -- confirm.
100 base::ReadFileToString(file_path
, &content
);
101 std::vector
<std::string
> lines
;
102 base::SplitString(content
, '\n', &lines
);
// State carried from one line to the next: the most recently parsed folder
// header (name, toolbar flag, add date) and the folder |path| that is built
// up as <DL> tags are seen.
104 base::string16 last_folder
;
105 bool last_folder_on_toolbar
= false;
106 bool last_folder_is_empty
= true;
107 bool has_subfolder
= false;
108 base::Time last_folder_add_date
;
109 std::vector
<base::string16
> path
;
// Set to path.size() when the <DL> of a toolbar folder is seen, and reset to
// 0 once it exceeds path.size(); 0 therefore means "no toolbar folder".
110 size_t toolbar_folder_index
= 0;
114 (cancellation_callback
.is_null() || !cancellation_callback
.Run());
117 base::TrimString(lines
[i
], " ", &line
);
119 // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
120 // separator in Firefox that Chrome does not support. Note that there can be
121 // multiple "<HR>" tags at the beginning of a single line.
122 // See http://crbug.com/257474.
123 static const char kHrTag
[] = "<HR>";
124 while (StartsWithASCII(line
, kHrTag
, false)) {
// arraysize() counts the terminating NUL, hence the "- 1".
125 line
.erase(0, arraysize(kHrTag
) - 1);
126 base::TrimString(line
, " ", &line
);
129 // Get the encoding of the bookmark file.
130 if (internal::ParseCharsetFromLine(line
, &charset
))
133 // Get the folder name.
134 if (internal::ParseFolderNameFromLine(line
,
137 &last_folder_on_toolbar
,
138 &last_folder_add_date
)) {
142 // Get the bookmark entry.
143 base::string16 title
;
144 base::string16 shortcut
;
147 base::string16 post_data
;
149 // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
// Try the full parser first and fall back to the minimal one, which only
// fills in |title| and |url|.
152 internal::ParseBookmarkFromLine(line
, charset
, &title
,
153 &url
, &favicon
, &shortcut
,
154 &add_date
, &post_data
) ||
155 internal::ParseMinimumBookmarkFromLine(line
, charset
, &title
, &url
);
157 // If bookmark contains a valid replaceable url and a keyword then import
158 // it as search engine.
159 std::string search_engine_url
;
160 if (is_bookmark
&& post_data
.empty() &&
161 CanImportURLAsSearchEngine(url
, &search_engine_url
) &&
163 importer::SearchEngineInfo search_engine_info
;
164 search_engine_info
.url
.assign(base::UTF8ToUTF16(search_engine_url
));
165 search_engine_info
.keyword
= shortcut
;
166 search_engine_info
.display_name
= title
;
167 search_engines
->push_back(search_engine_info
);
172 last_folder_is_empty
= false;
176 (valid_url_callback
.is_null() || valid_url_callback
.Run(url
))) {
177 if (toolbar_folder_index
> path
.size() && !path
.empty()) {
178 NOTREACHED(); // error in parsing.
182 ImportedBookmarkEntry entry
;
183 entry
.creation_time
= add_date
;
187 if (toolbar_folder_index
) {
188 // The toolbar folder should be at the top level.
// The entry's path therefore starts at the toolbar folder, not at the root.
189 entry
.in_toolbar
= true;
190 entry
.path
.assign(path
.begin() + toolbar_folder_index
- 1, path
.end());
192 // Add this bookmark to the list of |bookmarks|.
193 if (!has_subfolder
&& !last_folder
.empty()) {
194 path
.push_back(last_folder
);
197 entry
.path
.assign(path
.begin(), path
.end());
199 bookmarks
->push_back(entry
);
201 // Save the favicon. DataURLToFaviconUsage will handle the case where
202 // there is no favicon.
204 DataURLToFaviconUsage(url
, favicon
, favicons
);
209 // Bookmarks in sub-folder are encapsulated with <DL> tag.
210 if (StartsWithASCII(line
, "<DL>", false)) {
211 has_subfolder
= true;
212 if (!last_folder
.empty()) {
213 path
.push_back(last_folder
);
// Record the toolbar folder's depth the first time one is entered.
216 if (last_folder_on_toolbar
&& !toolbar_folder_index
)
217 toolbar_folder_index
= path
.size();
219 // Mark next folder empty as initial state.
220 last_folder_is_empty
= true;
221 } else if (StartsWithASCII(line
, "</DL>", false)) {
223 break; // Mismatch <DL>.
225 base::string16 folder_title
= path
.back();
228 if (last_folder_is_empty
) {
229 // Empty folder should be added explicitly.
230 ImportedBookmarkEntry entry
;
231 entry
.is_folder
= true;
232 entry
.creation_time
= last_folder_add_date
;
233 entry
.title
= folder_title
;
234 if (toolbar_folder_index
) {
235 // The toolbar folder should be at the top level.
236 // Make sure we don't add the toolbar folder itself if it is empty.
237 if (toolbar_folder_index
<= path
.size()) {
238 entry
.in_toolbar
= true;
239 entry
.path
.assign(path
.begin() + toolbar_folder_index
- 1,
241 bookmarks
->push_back(entry
);
244 // Add this folder to the list of |bookmarks|.
245 entry
.path
.assign(path
.begin(), path
.end());
246 bookmarks
->push_back(entry
);
249 // The parent folder includes the current one, so it's not empty.
250 last_folder_is_empty
= false;
// The toolbar folder has been left once |path| is shallower than its depth.
253 if (toolbar_folder_index
> path
.size())
254 toolbar_folder_index
= 0;
// Decides whether |url| can be imported as a search engine. The spec of |url|
// (taken with possibly_invalid_spec(), so |url| need not be valid) is
// unescaped with the URL_SPECIAL_CHARS rule and passed through
// TemplateURLRef::DisplayURLToURLRef(). On the path that reaches the end of
// the function, the processed spec is stored in |*search_engine_url| and the
// return value is whether a TemplateURL built from it supports replacement.
// NOTE(review): an empty spec presumably returns false without touching
// |*search_engine_url| -- confirm against the full function body.
259 bool CanImportURLAsSearchEngine(const GURL
& url
,
260 std::string
* search_engine_url
) {
261 std::string url_spec
= url
.possibly_invalid_spec();
263 if (url_spec
.empty())
266 url_spec
= net::UnescapeURLComponent(url_spec
,
267 net::UnescapeRule::URL_SPECIAL_CHARS
);
269 // Replace replacement terms ("%s") in |url_spec| with {searchTerms}.
271 TemplateURLRef::DisplayURLToURLRef(base::UTF8ToUTF16(url_spec
));
273 TemplateURLData data
;
274 data
.SetURL(url_spec
);
275 *search_engine_url
= url_spec
;
276 return TemplateURL(data
).SupportsReplacement(SearchTermsData());
// Extracts the character set declared on a <META ...> line. |line| must start
// with "<META" (compared case-insensitively) and contain either CONTENT=" or
// content=". The text between "charset=" and the next double quote is then
// stored in |*charset|.
// The "charset=" search itself is case-sensitive. If no double quote follows,
// find_first_of() yields npos and substr() takes the rest of the line.
281 bool ParseCharsetFromLine(const std::string
& line
, std::string
* charset
) {
282 const char kCharset
[] = "charset=";
283 if (StartsWithASCII(line
, "<META", false) &&
284 (line
.find("CONTENT=\"") != std::string::npos
||
285 line
.find("content=\"") != std::string::npos
)) {
286 size_t begin
= line
.find(kCharset
);
287 if (begin
== std::string::npos
)
// Advance past "charset=" to the first character of the charset name.
289 begin
+= std::string(kCharset
).size();
290 size_t end
= line
.find_first_of('\"', begin
);
291 *charset
= line
.substr(begin
, end
- begin
);
// Parses a folder header line of the form <DT><H3 ATTRIBUTES>name</H3>.
// |line| must start with "<DT><H3" (case-sensitive). When the tag structure
// is intact, the text between the end of the opening tag and "</H3>" is
// converted from |charset| to UTF-16, HTML-unescaped and stored in
// |*folder_name|.
// |*add_date| is set from the ADD_DATE attribute when that parses to a value
// in (0, 2^32). |*is_toolbar_folder| is set to true when
// PERSONAL_TOOLBAR_FOLDER equals "true" (compared via LowerCaseEqualsASCII),
// and to false on the alternative branch.
297 bool ParseFolderNameFromLine(const std::string
& line
,
298 const std::string
& charset
,
299 base::string16
* folder_name
,
300 bool* is_toolbar_folder
,
301 base::Time
* add_date
) {
302 const char kFolderOpen
[] = "<DT><H3";
303 const char kFolderClose
[] = "</H3>";
304 const char kToolbarFolderAttribute
[] = "PERSONAL_TOOLBAR_FOLDER";
305 const char kAddDateAttribute
[] = "ADD_DATE";
307 if (!StartsWithASCII(line
, kFolderOpen
, true))
310 size_t end
= line
.find(kFolderClose
);
// rfind() yields npos when no '>' is found; adding 1 wraps that to 0, which
// the check below treats as a broken start tag.
311 size_t tag_end
= line
.rfind('>', end
) + 1;
312 // If no end tag or start tag is broken, we skip to find the folder name.
313 if (end
== std::string::npos
|| tag_end
< arraysize(kFolderOpen
))
316 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
317 base::OnStringConversionError::SKIP
, folder_name
);
318 *folder_name
= net::UnescapeForHTML(*folder_name
);
// arraysize() includes the terminating NUL, so the attribute list starts one
// character after "<DT><H3" and stops just before the closing '>'.
// NOTE(review): when tag_end == arraysize(kFolderOpen) the length underflows
// and substr() takes the rest of the line -- confirm whether that matters.
320 std::string attribute_list
= line
.substr(arraysize(kFolderOpen
),
321 tag_end
- arraysize(kFolderOpen
) - 1);
325 if (GetAttribute(attribute_list
, kAddDateAttribute
, &value
)) {
327 base::StringToInt64(value
, &time
);
328 // Upper bound it at 32 bits.
329 if (0 < time
&& time
< (1LL << 32))
330 *add_date
= base::Time::FromTimeT(time
);
333 if (GetAttribute(attribute_list
, kToolbarFolderAttribute
, &value
) &&
334 LowerCaseEqualsASCII(value
, "true"))
335 *is_toolbar_folder
= true;
337 *is_toolbar_folder
= false;
// Parses a bookmark line of the form <DT><A ATTRIBUTES>title</A>.
// |line| must start with "<DT><A" (case-sensitive) and contain "</A>";
// otherwise the line is not treated as a bookmark. Attribute values and the
// title are converted from |charset| to UTF-16 and HTML-unescaped.
// Outputs visible in this function:
//   |*title|     - text between the opening tag and "</A>".
//   |*favicon|   - GURL built from the ICON attribute, if present.
//   |*shortcut|  - SHORTCUTURL attribute, if present.
//   |*add_date|  - reset to base::Time(), then set from ADD_DATE when that
//                  parses to a value in (0, 2^32).
//   |*post_data| - POST_DATA attribute, if present.
// The HREF attribute is decoded into a local UTF-16 string.
// NOTE(review): that string presumably becomes the output URL -- confirm.
342 bool ParseBookmarkFromLine(const std::string
& line
,
343 const std::string
& charset
,
344 base::string16
* title
,
347 base::string16
* shortcut
,
348 base::Time
* add_date
,
349 base::string16
* post_data
) {
350 const char kItemOpen
[] = "<DT><A";
351 const char kItemClose
[] = "</A>";
352 const char kFeedURLAttribute
[] = "FEEDURL";
353 const char kHrefAttribute
[] = "HREF";
354 const char kIconAttribute
[] = "ICON";
355 const char kShortcutURLAttribute
[] = "SHORTCUTURL";
356 const char kAddDateAttribute
[] = "ADD_DATE";
357 const char kPostDataAttribute
[] = "POST_DATA";
364 *add_date
= base::Time();
366 if (!StartsWithASCII(line
, kItemOpen
, true))
369 size_t end
= line
.find(kItemClose
);
// rfind() yields npos when no '>' is found; adding 1 wraps that to 0, which
// the check below rejects as a broken start tag.
370 size_t tag_end
= line
.rfind('>', end
) + 1;
371 if (end
== std::string::npos
|| tag_end
< arraysize(kItemOpen
))
372 return false; // No end tag or start tag is broken.
// arraysize() includes the terminating NUL, so the attribute list starts one
// character after "<DT><A" and stops just before the closing '>'.
374 std::string attribute_list
= line
.substr(arraysize(kItemOpen
),
375 tag_end
- arraysize(kItemOpen
) - 1);
377 // We don't import Live Bookmark folders, which is Firefox's RSS reading
378 // feature, since the user never necessarily bookmarked them and we don't
379 // have this feature to update their contents.
381 if (GetAttribute(attribute_list
, kFeedURLAttribute
, &value
))
385 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
386 base::OnStringConversionError::SKIP
, title
);
387 *title
= net::UnescapeForHTML(*title
);
390 if (GetAttribute(attribute_list
, kHrefAttribute
, &value
)) {
391 base::string16 url16
;
392 base::CodepageToUTF16(value
, charset
.c_str(),
393 base::OnStringConversionError::SKIP
, &url16
);
394 url16
= net::UnescapeForHTML(url16
);
400 if (GetAttribute(attribute_list
, kIconAttribute
, &value
))
401 *favicon
= GURL(value
);
404 if (GetAttribute(attribute_list
, kShortcutURLAttribute
, &value
)) {
405 base::CodepageToUTF16(value
, charset
.c_str(),
406 base::OnStringConversionError::SKIP
, shortcut
);
407 *shortcut
= net::UnescapeForHTML(*shortcut
);
411 if (GetAttribute(attribute_list
, kAddDateAttribute
, &value
)) {
413 base::StringToInt64(value
, &time
);
414 // Upper bound it at 32 bits.
415 if (0 < time
&& time
< (1LL << 32))
416 *add_date
= base::Time::FromTimeT(time
);
420 if (GetAttribute(attribute_list
, kPostDataAttribute
, &value
)) {
421 base::CodepageToUTF16(value
, charset
.c_str(),
422 base::OnStringConversionError::SKIP
, post_data
);
423 *post_data
= net::UnescapeForHTML(*post_data
);
// Lenient bookmark parser that extracts only the title and the link.
// It differs from ParseBookmarkFromLine() in the following ways:
//   - the "<DT><A" prefix is matched case-insensitively;
//   - any closing tag ("</") ends the title, not just "</A>";
//   - the link attribute is accepted as either HREF or href.
// |*title| receives the text between the opening tag and the first "</",
// converted from |charset| to UTF-16 and HTML-unescaped. When |charset| is
// non-empty the link value is converted the same way into a local string.
// NOTE(review): the empty-charset path and the final assignment of the URL
// are not shown here -- confirm against the full function body.
429 bool ParseMinimumBookmarkFromLine(const std::string
& line
,
430 const std::string
& charset
,
431 base::string16
* title
,
433 const char kItemOpen
[] = "<DT><A";
434 const char kItemClose
[] = "</";
435 const char kHrefAttributeUpper
[] = "HREF";
436 const char kHrefAttributeLower
[] = "href";
441 // Case-insensitive check of open tag.
442 if (!StartsWithASCII(line
, kItemOpen
, false))
445 // Find any close tag.
446 size_t end
= line
.find(kItemClose
);
// rfind() yields npos when no '>' is found; adding 1 wraps that to 0, which
// the check below rejects as a broken start tag.
447 size_t tag_end
= line
.rfind('>', end
) + 1;
448 if (end
== std::string::npos
|| tag_end
< arraysize(kItemOpen
))
449 return false; // No end tag or start tag is broken.
// arraysize() includes the terminating NUL, so the attribute list starts one
// character after "<DT><A" and stops just before the closing '>'.
451 std::string attribute_list
= line
.substr(arraysize(kItemOpen
),
452 tag_end
- arraysize(kItemOpen
) - 1);
455 base::CodepageToUTF16(line
.substr(tag_end
, end
- tag_end
), charset
.c_str(),
456 base::OnStringConversionError::SKIP
, title
);
457 *title
= net::UnescapeForHTML(*title
);
461 if (GetAttribute(attribute_list
, kHrefAttributeUpper
, &value
) ||
462 GetAttribute(attribute_list
, kHrefAttributeLower
, &value
)) {
463 if (charset
.length() != 0) {
464 base::string16 url16
;
465 base::CodepageToUTF16(value
, charset
.c_str(),
466 base::OnStringConversionError::SKIP
, &url16
);
467 url16
= net::UnescapeForHTML(url16
);
478 } // namespace internal
480 } // namespace bookmark_html_reader