Revert "Only store leading 13 bits of password hash."
[chromium-blink-merge.git] / chrome / utility / importer / bookmark_html_reader.cc
blob8ee67f1bf20e3078f06a4ee774955ffe374cb416
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/utility/importer/bookmark_html_reader.h"
7 #include "base/callback.h"
8 #include "base/files/file_util.h"
9 #include "base/i18n/icu_string_conversions.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "base/time/time.h"
15 #include "chrome/common/importer/imported_bookmark_entry.h"
16 #include "chrome/common/importer/imported_favicon_usage.h"
17 #include "chrome/utility/importer/favicon_reencode.h"
18 #include "components/search_engines/search_terms_data.h"
19 #include "components/search_engines/template_url.h"
20 #include "net/base/data_url.h"
21 #include "net/base/escape.h"
22 #include "url/gurl.h"
23 #include "url/url_constants.h"
25 namespace {
// Fetches the given |attribute| value from the |attribute_list|. Returns true
// if successful, and |value| will contain the value.
//
// The attribute has to appear in the form ATTRIBUTE="value". A double quote
// that directly follows a backslash is treated as part of the value instead of
// as the closing quote; the backslash itself is kept in |value|. Returns false
// and leaves |value| untouched when the attribute is not present or when the
// closing quote is missing. An empty value (ATTRIBUTE="") is reported as
// success with an empty |value|.
bool GetAttribute(const std::string& attribute_list,
                  const std::string& attribute,
                  std::string* value) {
  const char kQuote[] = "\"";

  size_t begin = attribute_list.find(attribute + "=" + kQuote);
  if (begin == std::string::npos)
    return false;  // Can't find the attribute.

  // Skip over ATTRIBUTE=" so that |begin| is the first character of the value.
  begin += attribute.size() + 2;

  // Scan from |begin| itself (not |begin| + 1): for an empty value the closing
  // quote sits exactly at |begin|. The character before |begin| is the opening
  // quote, so the backslash check below is always in range.
  size_t end = begin;
  while (end < attribute_list.size()) {
    if (attribute_list[end] == '"' && attribute_list[end - 1] != '\\')
      break;
    ++end;
  }

  if (end >= attribute_list.size())
    return false;  // The value is not quoted.

  *value = attribute_list.substr(begin, end - begin);
  return true;
}
56 // Given the URL of a page and a favicon data URL, adds an appropriate record
57 // to the given favicon usage vector.
58 void DataURLToFaviconUsage(
59 const GURL& link_url,
60 const GURL& favicon_data,
61 std::vector<ImportedFaviconUsage>* favicons) {
62 if (!link_url.is_valid() || !favicon_data.is_valid() ||
63 !favicon_data.SchemeIs(url::kDataScheme))
64 return;
66 // Parse the data URL.
67 std::string mime_type, char_set, data;
68 if (!net::DataURL::Parse(favicon_data, &mime_type, &char_set, &data) ||
69 data.empty())
70 return;
72 ImportedFaviconUsage usage;
73 if (!importer::ReencodeFavicon(
74 reinterpret_cast<const unsigned char*>(&data[0]),
75 data.size(), &usage.png_data))
76 return; // Unable to decode.
78 // We need to make up a URL for the favicon. We use a version of the page's
79 // URL so that we can be sure it will not collide.
80 usage.favicon_url = GURL(std::string("made-up-favicon:") + link_url.spec());
82 // We only have one URL per favicon for Firefox 2 bookmarks.
83 usage.urls.insert(link_url);
85 favicons->push_back(usage);
88 } // namespace
90 namespace bookmark_html_reader {
92 void ImportBookmarksFile(
93 const base::Callback<bool(void)>& cancellation_callback,
94 const base::Callback<bool(const GURL&)>& valid_url_callback,
95 const base::FilePath& file_path,
96 std::vector<ImportedBookmarkEntry>* bookmarks,
97 std::vector<importer::SearchEngineInfo>* search_engines,
98 std::vector<ImportedFaviconUsage>* favicons) {
99 std::string content;
100 base::ReadFileToString(file_path, &content);
101 std::vector<std::string> lines;
102 base::SplitString(content, '\n', &lines);
104 base::string16 last_folder;
105 bool last_folder_on_toolbar = false;
106 bool last_folder_is_empty = true;
107 bool has_subfolder = false;
108 base::Time last_folder_add_date;
109 std::vector<base::string16> path;
110 size_t toolbar_folder_index = 0;
111 std::string charset;
112 for (size_t i = 0;
113 i < lines.size() &&
114 (cancellation_callback.is_null() || !cancellation_callback.Run());
115 ++i) {
116 std::string line;
117 base::TrimString(lines[i], " ", &line);
119 // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
120 // separator in Firefox that Chrome does not support. Note that there can be
121 // multiple "<HR>" tags at the beginning of a single line.
122 // See http://crbug.com/257474.
123 static const char kHrTag[] = "<HR>";
124 while (StartsWithASCII(line, kHrTag, false)) {
125 line.erase(0, arraysize(kHrTag) - 1);
126 base::TrimString(line, " ", &line);
129 // Get the encoding of the bookmark file.
130 if (internal::ParseCharsetFromLine(line, &charset))
131 continue;
133 // Get the folder name.
134 if (internal::ParseFolderNameFromLine(line,
135 charset,
136 &last_folder,
137 &last_folder_on_toolbar,
138 &last_folder_add_date)) {
139 continue;
142 // Get the bookmark entry.
143 base::string16 title;
144 base::string16 shortcut;
145 GURL url, favicon;
146 base::Time add_date;
147 base::string16 post_data;
148 bool is_bookmark;
149 // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
150 // keywords yet.
151 is_bookmark =
152 internal::ParseBookmarkFromLine(line, charset, &title,
153 &url, &favicon, &shortcut,
154 &add_date, &post_data) ||
155 internal::ParseMinimumBookmarkFromLine(line, charset, &title, &url);
157 // If bookmark contains a valid replaceable url and a keyword then import
158 // it as search engine.
159 std::string search_engine_url;
160 if (is_bookmark && post_data.empty() &&
161 CanImportURLAsSearchEngine(url, &search_engine_url) &&
162 !shortcut.empty()) {
163 importer::SearchEngineInfo search_engine_info;
164 search_engine_info.url.assign(base::UTF8ToUTF16(search_engine_url));
165 search_engine_info.keyword = shortcut;
166 search_engine_info.display_name = title;
167 search_engines->push_back(search_engine_info);
168 continue;
171 if (is_bookmark)
172 last_folder_is_empty = false;
174 if (is_bookmark &&
175 post_data.empty() &&
176 (valid_url_callback.is_null() || valid_url_callback.Run(url))) {
177 if (toolbar_folder_index > path.size() && !path.empty()) {
178 NOTREACHED(); // error in parsing.
179 break;
182 ImportedBookmarkEntry entry;
183 entry.creation_time = add_date;
184 entry.url = url;
185 entry.title = title;
187 if (toolbar_folder_index) {
188 // The toolbar folder should be at the top level.
189 entry.in_toolbar = true;
190 entry.path.assign(path.begin() + toolbar_folder_index - 1, path.end());
191 } else {
192 // Add this bookmark to the list of |bookmarks|.
193 if (!has_subfolder && !last_folder.empty()) {
194 path.push_back(last_folder);
195 last_folder.clear();
197 entry.path.assign(path.begin(), path.end());
199 bookmarks->push_back(entry);
201 // Save the favicon. DataURLToFaviconUsage will handle the case where
202 // there is no favicon.
203 if (favicons)
204 DataURLToFaviconUsage(url, favicon, favicons);
206 continue;
209 // Bookmarks in sub-folder are encapsulated with <DL> tag.
210 if (StartsWithASCII(line, "<DL>", false)) {
211 has_subfolder = true;
212 if (!last_folder.empty()) {
213 path.push_back(last_folder);
214 last_folder.clear();
216 if (last_folder_on_toolbar && !toolbar_folder_index)
217 toolbar_folder_index = path.size();
219 // Mark next folder empty as initial state.
220 last_folder_is_empty = true;
221 } else if (StartsWithASCII(line, "</DL>", false)) {
222 if (path.empty())
223 break; // Mismatch <DL>.
225 base::string16 folder_title = path.back();
226 path.pop_back();
228 if (last_folder_is_empty) {
229 // Empty folder should be added explicitly.
230 ImportedBookmarkEntry entry;
231 entry.is_folder = true;
232 entry.creation_time = last_folder_add_date;
233 entry.title = folder_title;
234 if (toolbar_folder_index) {
235 // The toolbar folder should be at the top level.
236 // Make sure we don't add the toolbar folder itself if it is empty.
237 if (toolbar_folder_index <= path.size()) {
238 entry.in_toolbar = true;
239 entry.path.assign(path.begin() + toolbar_folder_index - 1,
240 path.end());
241 bookmarks->push_back(entry);
243 } else {
244 // Add this folder to the list of |bookmarks|.
245 entry.path.assign(path.begin(), path.end());
246 bookmarks->push_back(entry);
249 // Parent folder include current one, so it's not empty.
250 last_folder_is_empty = false;
253 if (toolbar_folder_index > path.size())
254 toolbar_folder_index = 0;
259 bool CanImportURLAsSearchEngine(const GURL& url,
260 std::string* search_engine_url) {
261 std::string url_spec = url.possibly_invalid_spec();
263 if (url_spec.empty())
264 return false;
266 url_spec = net::UnescapeURLComponent(url_spec,
267 net::UnescapeRule::URL_SPECIAL_CHARS);
269 // Replace replacement terms ("%s") in |url_spec| with {searchTerms}.
270 url_spec =
271 TemplateURLRef::DisplayURLToURLRef(base::UTF8ToUTF16(url_spec));
273 TemplateURLData data;
274 data.SetURL(url_spec);
275 *search_engine_url = url_spec;
276 return TemplateURL(data).SupportsReplacement(SearchTermsData());
279 namespace internal {
281 bool ParseCharsetFromLine(const std::string& line, std::string* charset) {
282 const char kCharset[] = "charset=";
283 if (StartsWithASCII(line, "<META", false) &&
284 (line.find("CONTENT=\"") != std::string::npos ||
285 line.find("content=\"") != std::string::npos)) {
286 size_t begin = line.find(kCharset);
287 if (begin == std::string::npos)
288 return false;
289 begin += std::string(kCharset).size();
290 size_t end = line.find_first_of('\"', begin);
291 *charset = line.substr(begin, end - begin);
292 return true;
294 return false;
297 bool ParseFolderNameFromLine(const std::string& line,
298 const std::string& charset,
299 base::string16* folder_name,
300 bool* is_toolbar_folder,
301 base::Time* add_date) {
302 const char kFolderOpen[] = "<DT><H3";
303 const char kFolderClose[] = "</H3>";
304 const char kToolbarFolderAttribute[] = "PERSONAL_TOOLBAR_FOLDER";
305 const char kAddDateAttribute[] = "ADD_DATE";
307 if (!StartsWithASCII(line, kFolderOpen, true))
308 return false;
310 size_t end = line.find(kFolderClose);
311 size_t tag_end = line.rfind('>', end) + 1;
312 // If no end tag or start tag is broken, we skip to find the folder name.
313 if (end == std::string::npos || tag_end < arraysize(kFolderOpen))
314 return false;
316 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
317 base::OnStringConversionError::SKIP, folder_name);
318 *folder_name = net::UnescapeForHTML(*folder_name);
320 std::string attribute_list = line.substr(arraysize(kFolderOpen),
321 tag_end - arraysize(kFolderOpen) - 1);
322 std::string value;
324 // Add date
325 if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
326 int64 time;
327 base::StringToInt64(value, &time);
328 // Upper bound it at 32 bits.
329 if (0 < time && time < (1LL << 32))
330 *add_date = base::Time::FromTimeT(time);
333 if (GetAttribute(attribute_list, kToolbarFolderAttribute, &value) &&
334 LowerCaseEqualsASCII(value, "true"))
335 *is_toolbar_folder = true;
336 else
337 *is_toolbar_folder = false;
339 return true;
342 bool ParseBookmarkFromLine(const std::string& line,
343 const std::string& charset,
344 base::string16* title,
345 GURL* url,
346 GURL* favicon,
347 base::string16* shortcut,
348 base::Time* add_date,
349 base::string16* post_data) {
350 const char kItemOpen[] = "<DT><A";
351 const char kItemClose[] = "</A>";
352 const char kFeedURLAttribute[] = "FEEDURL";
353 const char kHrefAttribute[] = "HREF";
354 const char kIconAttribute[] = "ICON";
355 const char kShortcutURLAttribute[] = "SHORTCUTURL";
356 const char kAddDateAttribute[] = "ADD_DATE";
357 const char kPostDataAttribute[] = "POST_DATA";
359 title->clear();
360 *url = GURL();
361 *favicon = GURL();
362 shortcut->clear();
363 post_data->clear();
364 *add_date = base::Time();
366 if (!StartsWithASCII(line, kItemOpen, true))
367 return false;
369 size_t end = line.find(kItemClose);
370 size_t tag_end = line.rfind('>', end) + 1;
371 if (end == std::string::npos || tag_end < arraysize(kItemOpen))
372 return false; // No end tag or start tag is broken.
374 std::string attribute_list = line.substr(arraysize(kItemOpen),
375 tag_end - arraysize(kItemOpen) - 1);
377 // We don't import Live Bookmark folders, which is Firefox's RSS reading
378 // feature, since the user never necessarily bookmarked them and we don't
379 // have this feature to update their contents.
380 std::string value;
381 if (GetAttribute(attribute_list, kFeedURLAttribute, &value))
382 return false;
384 // Title
385 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
386 base::OnStringConversionError::SKIP, title);
387 *title = net::UnescapeForHTML(*title);
389 // URL
390 if (GetAttribute(attribute_list, kHrefAttribute, &value)) {
391 base::string16 url16;
392 base::CodepageToUTF16(value, charset.c_str(),
393 base::OnStringConversionError::SKIP, &url16);
394 url16 = net::UnescapeForHTML(url16);
396 *url = GURL(url16);
399 // Favicon
400 if (GetAttribute(attribute_list, kIconAttribute, &value))
401 *favicon = GURL(value);
403 // Keyword
404 if (GetAttribute(attribute_list, kShortcutURLAttribute, &value)) {
405 base::CodepageToUTF16(value, charset.c_str(),
406 base::OnStringConversionError::SKIP, shortcut);
407 *shortcut = net::UnescapeForHTML(*shortcut);
410 // Add date
411 if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
412 int64 time;
413 base::StringToInt64(value, &time);
414 // Upper bound it at 32 bits.
415 if (0 < time && time < (1LL << 32))
416 *add_date = base::Time::FromTimeT(time);
419 // Post data.
420 if (GetAttribute(attribute_list, kPostDataAttribute, &value)) {
421 base::CodepageToUTF16(value, charset.c_str(),
422 base::OnStringConversionError::SKIP, post_data);
423 *post_data = net::UnescapeForHTML(*post_data);
426 return true;
429 bool ParseMinimumBookmarkFromLine(const std::string& line,
430 const std::string& charset,
431 base::string16* title,
432 GURL* url) {
433 const char kItemOpen[] = "<DT><A";
434 const char kItemClose[] = "</";
435 const char kHrefAttributeUpper[] = "HREF";
436 const char kHrefAttributeLower[] = "href";
438 title->clear();
439 *url = GURL();
441 // Case-insensitive check of open tag.
442 if (!StartsWithASCII(line, kItemOpen, false))
443 return false;
445 // Find any close tag.
446 size_t end = line.find(kItemClose);
447 size_t tag_end = line.rfind('>', end) + 1;
448 if (end == std::string::npos || tag_end < arraysize(kItemOpen))
449 return false; // No end tag or start tag is broken.
451 std::string attribute_list = line.substr(arraysize(kItemOpen),
452 tag_end - arraysize(kItemOpen) - 1);
454 // Title
455 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
456 base::OnStringConversionError::SKIP, title);
457 *title = net::UnescapeForHTML(*title);
459 // URL
460 std::string value;
461 if (GetAttribute(attribute_list, kHrefAttributeUpper, &value) ||
462 GetAttribute(attribute_list, kHrefAttributeLower, &value)) {
463 if (charset.length() != 0) {
464 base::string16 url16;
465 base::CodepageToUTF16(value, charset.c_str(),
466 base::OnStringConversionError::SKIP, &url16);
467 url16 = net::UnescapeForHTML(url16);
469 *url = GURL(url16);
470 } else {
471 *url = GURL(value);
475 return true;
478 } // namespace internal
480 } // namespace bookmark_html_reader