Fix broken path in extensions/common/PRESUBMIT.py
[chromium-blink-merge.git] / chrome / utility / importer / bookmark_html_reader.cc
blob2ba117152feaaeb611d46a51671262101fcff680
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/utility/importer/bookmark_html_reader.h"
7 #include "base/callback.h"
8 #include "base/files/file_util.h"
9 #include "base/i18n/icu_string_conversions.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "base/time/time.h"
15 #include "chrome/common/importer/imported_bookmark_entry.h"
16 #include "chrome/utility/importer/favicon_reencode.h"
17 #include "components/search_engines/search_terms_data.h"
18 #include "components/search_engines/template_url.h"
19 #include "net/base/data_url.h"
20 #include "net/base/escape.h"
21 #include "url/gurl.h"
22 #include "url/url_constants.h"
24 namespace {
// Fetches the given |attribute| value from the |attribute_list|. Returns true
// if successful, and |value| will contain the value. The value may be empty
// (e.g. HREF=""). Only values wrapped in double quotes are recognized, and a
// double quote preceded by a backslash is treated as part of the value.
// |value| is left untouched when false is returned.
bool GetAttribute(const std::string& attribute_list,
                  const std::string& attribute,
                  std::string* value) {
  // The attribute name followed by '=' and the opening quote.
  const std::string opening = attribute + "=\"";

  const size_t opening_pos = attribute_list.find(opening);
  if (opening_pos == std::string::npos)
    return false;  // Can't find the attribute.

  // Start scanning at the first character after the opening quote so that an
  // immediately following closing quote (an empty value) is recognized instead
  // of being skipped. |begin| is always >= 1, so |end - 1| is a valid index.
  const size_t begin = opening_pos + opening.size();
  size_t end = begin;
  while (end < attribute_list.size()) {
    if (attribute_list[end] == '"' && attribute_list[end - 1] != '\\')
      break;
    ++end;
  }

  // Running off the end (including an opening quote that is the last
  // character of the list) means there is no closing quote.
  if (end >= attribute_list.size())
    return false;  // The value is not quoted.

  *value = attribute_list.substr(begin, end - begin);
  return true;
}
55 // Given the URL of a page and a favicon data URL, adds an appropriate record
56 // to the given favicon usage vector.
57 void DataURLToFaviconUsage(const GURL& link_url,
58 const GURL& favicon_data,
59 favicon_base::FaviconUsageDataList* favicons) {
60 if (!link_url.is_valid() || !favicon_data.is_valid() ||
61 !favicon_data.SchemeIs(url::kDataScheme))
62 return;
64 // Parse the data URL.
65 std::string mime_type, char_set, data;
66 if (!net::DataURL::Parse(favicon_data, &mime_type, &char_set, &data) ||
67 data.empty())
68 return;
70 favicon_base::FaviconUsageData usage;
71 if (!importer::ReencodeFavicon(
72 reinterpret_cast<const unsigned char*>(&data[0]),
73 data.size(), &usage.png_data))
74 return; // Unable to decode.
76 // We need to make up a URL for the favicon. We use a version of the page's
77 // URL so that we can be sure it will not collide.
78 usage.favicon_url = GURL(std::string("made-up-favicon:") + link_url.spec());
80 // We only have one URL per favicon for Firefox 2 bookmarks.
81 usage.urls.insert(link_url);
83 favicons->push_back(usage);
86 } // namespace
88 namespace bookmark_html_reader {
90 void ImportBookmarksFile(
91 const base::Callback<bool(void)>& cancellation_callback,
92 const base::Callback<bool(const GURL&)>& valid_url_callback,
93 const base::FilePath& file_path,
94 std::vector<ImportedBookmarkEntry>* bookmarks,
95 std::vector<importer::SearchEngineInfo>* search_engines,
96 favicon_base::FaviconUsageDataList* favicons) {
97 std::string content;
98 base::ReadFileToString(file_path, &content);
99 std::vector<std::string> lines;
100 base::SplitString(content, '\n', &lines);
102 base::string16 last_folder;
103 bool last_folder_on_toolbar = false;
104 bool last_folder_is_empty = true;
105 bool has_subfolder = false;
106 base::Time last_folder_add_date;
107 std::vector<base::string16> path;
108 size_t toolbar_folder_index = 0;
109 std::string charset;
110 for (size_t i = 0;
111 i < lines.size() &&
112 (cancellation_callback.is_null() || !cancellation_callback.Run());
113 ++i) {
114 std::string line;
115 base::TrimString(lines[i], " ", &line);
117 // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
118 // separator in Firefox that Chrome does not support. Note that there can be
119 // multiple "<HR>" tags at the beginning of a single line.
120 // See http://crbug.com/257474.
121 static const char kHrTag[] = "<HR>";
122 while (StartsWithASCII(line, kHrTag, false)) {
123 line.erase(0, arraysize(kHrTag) - 1);
124 base::TrimString(line, " ", &line);
127 // Get the encoding of the bookmark file.
128 if (internal::ParseCharsetFromLine(line, &charset))
129 continue;
131 // Get the folder name.
132 if (internal::ParseFolderNameFromLine(line,
133 charset,
134 &last_folder,
135 &last_folder_on_toolbar,
136 &last_folder_add_date)) {
137 continue;
140 // Get the bookmark entry.
141 base::string16 title;
142 base::string16 shortcut;
143 GURL url, favicon;
144 base::Time add_date;
145 base::string16 post_data;
146 bool is_bookmark;
147 // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
148 // keywords yet.
149 is_bookmark =
150 internal::ParseBookmarkFromLine(line, charset, &title,
151 &url, &favicon, &shortcut,
152 &add_date, &post_data) ||
153 internal::ParseMinimumBookmarkFromLine(line, charset, &title, &url);
155 // If bookmark contains a valid replaceable url and a keyword then import
156 // it as search engine.
157 std::string search_engine_url;
158 if (is_bookmark && post_data.empty() &&
159 CanImportURLAsSearchEngine(url, &search_engine_url) &&
160 !shortcut.empty()) {
161 importer::SearchEngineInfo search_engine_info;
162 search_engine_info.url.assign(base::UTF8ToUTF16(search_engine_url));
163 search_engine_info.keyword = shortcut;
164 search_engine_info.display_name = title;
165 search_engines->push_back(search_engine_info);
166 continue;
169 if (is_bookmark)
170 last_folder_is_empty = false;
172 if (is_bookmark &&
173 post_data.empty() &&
174 (valid_url_callback.is_null() || valid_url_callback.Run(url))) {
175 if (toolbar_folder_index > path.size() && !path.empty()) {
176 NOTREACHED(); // error in parsing.
177 break;
180 ImportedBookmarkEntry entry;
181 entry.creation_time = add_date;
182 entry.url = url;
183 entry.title = title;
185 if (toolbar_folder_index) {
186 // The toolbar folder should be at the top level.
187 entry.in_toolbar = true;
188 entry.path.assign(path.begin() + toolbar_folder_index - 1, path.end());
189 } else {
190 // Add this bookmark to the list of |bookmarks|.
191 if (!has_subfolder && !last_folder.empty()) {
192 path.push_back(last_folder);
193 last_folder.clear();
195 entry.path.assign(path.begin(), path.end());
197 bookmarks->push_back(entry);
199 // Save the favicon. DataURLToFaviconUsage will handle the case where
200 // there is no favicon.
201 if (favicons)
202 DataURLToFaviconUsage(url, favicon, favicons);
204 continue;
207 // Bookmarks in sub-folder are encapsulated with <DL> tag.
208 if (StartsWithASCII(line, "<DL>", false)) {
209 has_subfolder = true;
210 if (!last_folder.empty()) {
211 path.push_back(last_folder);
212 last_folder.clear();
214 if (last_folder_on_toolbar && !toolbar_folder_index)
215 toolbar_folder_index = path.size();
217 // Mark next folder empty as initial state.
218 last_folder_is_empty = true;
219 } else if (StartsWithASCII(line, "</DL>", false)) {
220 if (path.empty())
221 break; // Mismatch <DL>.
223 base::string16 folder_title = path.back();
224 path.pop_back();
226 if (last_folder_is_empty) {
227 // Empty folder should be added explicitly.
228 ImportedBookmarkEntry entry;
229 entry.is_folder = true;
230 entry.creation_time = last_folder_add_date;
231 entry.title = folder_title;
232 if (toolbar_folder_index) {
233 // The toolbar folder should be at the top level.
234 // Make sure we don't add the toolbar folder itself if it is empty.
235 if (toolbar_folder_index <= path.size()) {
236 entry.in_toolbar = true;
237 entry.path.assign(path.begin() + toolbar_folder_index - 1,
238 path.end());
239 bookmarks->push_back(entry);
241 } else {
242 // Add this folder to the list of |bookmarks|.
243 entry.path.assign(path.begin(), path.end());
244 bookmarks->push_back(entry);
247 // Parent folder include current one, so it's not empty.
248 last_folder_is_empty = false;
251 if (toolbar_folder_index > path.size())
252 toolbar_folder_index = 0;
257 bool CanImportURLAsSearchEngine(const GURL& url,
258 std::string* search_engine_url) {
259 std::string url_spec = url.possibly_invalid_spec();
261 if (url_spec.empty())
262 return false;
264 url_spec = net::UnescapeURLComponent(url_spec,
265 net::UnescapeRule::URL_SPECIAL_CHARS);
267 // Replace replacement terms ("%s") in |url_spec| with {searchTerms}.
268 url_spec =
269 TemplateURLRef::DisplayURLToURLRef(base::UTF8ToUTF16(url_spec));
271 TemplateURLData data;
272 data.SetURL(url_spec);
273 *search_engine_url = url_spec;
274 return TemplateURL(data).SupportsReplacement(SearchTermsData());
277 namespace internal {
279 bool ParseCharsetFromLine(const std::string& line, std::string* charset) {
280 const char kCharset[] = "charset=";
281 if (StartsWithASCII(line, "<META", false) &&
282 (line.find("CONTENT=\"") != std::string::npos ||
283 line.find("content=\"") != std::string::npos)) {
284 size_t begin = line.find(kCharset);
285 if (begin == std::string::npos)
286 return false;
287 begin += std::string(kCharset).size();
288 size_t end = line.find_first_of('\"', begin);
289 *charset = line.substr(begin, end - begin);
290 return true;
292 return false;
295 bool ParseFolderNameFromLine(const std::string& line,
296 const std::string& charset,
297 base::string16* folder_name,
298 bool* is_toolbar_folder,
299 base::Time* add_date) {
300 const char kFolderOpen[] = "<DT><H3";
301 const char kFolderClose[] = "</H3>";
302 const char kToolbarFolderAttribute[] = "PERSONAL_TOOLBAR_FOLDER";
303 const char kAddDateAttribute[] = "ADD_DATE";
305 if (!StartsWithASCII(line, kFolderOpen, true))
306 return false;
308 size_t end = line.find(kFolderClose);
309 size_t tag_end = line.rfind('>', end) + 1;
310 // If no end tag or start tag is broken, we skip to find the folder name.
311 if (end == std::string::npos || tag_end < arraysize(kFolderOpen))
312 return false;
314 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
315 base::OnStringConversionError::SKIP, folder_name);
316 *folder_name = net::UnescapeForHTML(*folder_name);
318 std::string attribute_list = line.substr(arraysize(kFolderOpen),
319 tag_end - arraysize(kFolderOpen) - 1);
320 std::string value;
322 // Add date
323 if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
324 int64 time;
325 base::StringToInt64(value, &time);
326 // Upper bound it at 32 bits.
327 if (0 < time && time < (1LL << 32))
328 *add_date = base::Time::FromTimeT(time);
331 if (GetAttribute(attribute_list, kToolbarFolderAttribute, &value) &&
332 LowerCaseEqualsASCII(value, "true"))
333 *is_toolbar_folder = true;
334 else
335 *is_toolbar_folder = false;
337 return true;
340 bool ParseBookmarkFromLine(const std::string& line,
341 const std::string& charset,
342 base::string16* title,
343 GURL* url,
344 GURL* favicon,
345 base::string16* shortcut,
346 base::Time* add_date,
347 base::string16* post_data) {
348 const char kItemOpen[] = "<DT><A";
349 const char kItemClose[] = "</A>";
350 const char kFeedURLAttribute[] = "FEEDURL";
351 const char kHrefAttribute[] = "HREF";
352 const char kIconAttribute[] = "ICON";
353 const char kShortcutURLAttribute[] = "SHORTCUTURL";
354 const char kAddDateAttribute[] = "ADD_DATE";
355 const char kPostDataAttribute[] = "POST_DATA";
357 title->clear();
358 *url = GURL();
359 *favicon = GURL();
360 shortcut->clear();
361 post_data->clear();
362 *add_date = base::Time();
364 if (!StartsWithASCII(line, kItemOpen, true))
365 return false;
367 size_t end = line.find(kItemClose);
368 size_t tag_end = line.rfind('>', end) + 1;
369 if (end == std::string::npos || tag_end < arraysize(kItemOpen))
370 return false; // No end tag or start tag is broken.
372 std::string attribute_list = line.substr(arraysize(kItemOpen),
373 tag_end - arraysize(kItemOpen) - 1);
375 // We don't import Live Bookmark folders, which is Firefox's RSS reading
376 // feature, since the user never necessarily bookmarked them and we don't
377 // have this feature to update their contents.
378 std::string value;
379 if (GetAttribute(attribute_list, kFeedURLAttribute, &value))
380 return false;
382 // Title
383 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
384 base::OnStringConversionError::SKIP, title);
385 *title = net::UnescapeForHTML(*title);
387 // URL
388 if (GetAttribute(attribute_list, kHrefAttribute, &value)) {
389 base::string16 url16;
390 base::CodepageToUTF16(value, charset.c_str(),
391 base::OnStringConversionError::SKIP, &url16);
392 url16 = net::UnescapeForHTML(url16);
394 *url = GURL(url16);
397 // Favicon
398 if (GetAttribute(attribute_list, kIconAttribute, &value))
399 *favicon = GURL(value);
401 // Keyword
402 if (GetAttribute(attribute_list, kShortcutURLAttribute, &value)) {
403 base::CodepageToUTF16(value, charset.c_str(),
404 base::OnStringConversionError::SKIP, shortcut);
405 *shortcut = net::UnescapeForHTML(*shortcut);
408 // Add date
409 if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
410 int64 time;
411 base::StringToInt64(value, &time);
412 // Upper bound it at 32 bits.
413 if (0 < time && time < (1LL << 32))
414 *add_date = base::Time::FromTimeT(time);
417 // Post data.
418 if (GetAttribute(attribute_list, kPostDataAttribute, &value)) {
419 base::CodepageToUTF16(value, charset.c_str(),
420 base::OnStringConversionError::SKIP, post_data);
421 *post_data = net::UnescapeForHTML(*post_data);
424 return true;
427 bool ParseMinimumBookmarkFromLine(const std::string& line,
428 const std::string& charset,
429 base::string16* title,
430 GURL* url) {
431 const char kItemOpen[] = "<DT><A";
432 const char kItemClose[] = "</";
433 const char kHrefAttributeUpper[] = "HREF";
434 const char kHrefAttributeLower[] = "href";
436 title->clear();
437 *url = GURL();
439 // Case-insensitive check of open tag.
440 if (!StartsWithASCII(line, kItemOpen, false))
441 return false;
443 // Find any close tag.
444 size_t end = line.find(kItemClose);
445 size_t tag_end = line.rfind('>', end) + 1;
446 if (end == std::string::npos || tag_end < arraysize(kItemOpen))
447 return false; // No end tag or start tag is broken.
449 std::string attribute_list = line.substr(arraysize(kItemOpen),
450 tag_end - arraysize(kItemOpen) - 1);
452 // Title
453 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
454 base::OnStringConversionError::SKIP, title);
455 *title = net::UnescapeForHTML(*title);
457 // URL
458 std::string value;
459 if (GetAttribute(attribute_list, kHrefAttributeUpper, &value) ||
460 GetAttribute(attribute_list, kHrefAttributeLower, &value)) {
461 if (charset.length() != 0) {
462 base::string16 url16;
463 base::CodepageToUTF16(value, charset.c_str(),
464 base::OnStringConversionError::SKIP, &url16);
465 url16 = net::UnescapeForHTML(url16);
467 *url = GURL(url16);
468 } else {
469 *url = GURL(value);
473 return true;
476 } // namespace internal
478 } // namespace bookmark_html_reader