Remove stray platform_file.h from chrome and components
[chromium-blink-merge.git] / chrome / browser / devtools / devtools_file_system_indexer.cc
blobedeeca108b1fa67de8d071edd5581fb427c43984
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/devtools/devtools_file_system_indexer.h"
7 #include <iterator>
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/file_util.h"
12 #include "base/files/file_enumerator.h"
13 #include "base/files/file_util_proxy.h"
14 #include "base/lazy_instance.h"
15 #include "base/logging.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "content/public/browser/browser_thread.h"
19 using base::Bind;
20 using base::Callback;
21 using base::FileEnumerator;
22 using base::FilePath;
23 using base::Time;
24 using base::TimeDelta;
25 using base::TimeTicks;
26 using content::BrowserThread;
27 using std::map;
28 using std::set;
29 using std::string;
30 using std::vector;
32 namespace {
34 typedef int32 Trigram;
35 typedef char TrigramChar;
36 typedef uint16 FileId;
38 const int kMinTimeoutBetweenWorkedNitification = 200;
39 // Trigram characters include all ASCII printable characters (32-126) except for
40 // the capital letters, because the index is case insensitive.
41 const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1;
42 const size_t kTrigramCount =
43 kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount;
44 const int kMaxReadLength = 10 * 1024;
45 const TrigramChar kUndefinedTrigramChar = -1;
46 const Trigram kUndefinedTrigram = -1;
48 base::LazyInstance<vector<bool> >::Leaky g_is_binary_char =
49 LAZY_INSTANCE_INITIALIZER;
51 base::LazyInstance<vector<TrigramChar> >::Leaky g_trigram_chars =
52 LAZY_INSTANCE_INITIALIZER;
54 class Index {
55 public:
56 Index();
57 Time LastModifiedTimeForFile(const FilePath& file_path);
58 void SetTrigramsForFile(const FilePath& file_path,
59 const vector<Trigram>& index,
60 const Time& time);
61 vector<FilePath> Search(string query);
62 void PrintStats();
63 void NormalizeVectors();
65 private:
66 ~Index();
68 FileId GetFileId(const FilePath& file_path);
70 typedef map<FilePath, FileId> FileIdsMap;
71 FileIdsMap file_ids_;
72 FileId last_file_id_;
73 // The index in this vector is the trigram id.
74 vector<vector<FileId> > index_;
75 typedef map<FilePath, Time> IndexedFilesMap;
76 IndexedFilesMap index_times_;
77 vector<bool> is_normalized_;
79 DISALLOW_COPY_AND_ASSIGN(Index);
82 base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER;
84 void InitIsBinaryCharMap() {
85 for (size_t i = 0; i < 256; ++i) {
86 unsigned char ch = i;
87 bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127;
88 g_is_binary_char.Get().push_back(is_binary_char);
92 bool IsBinaryChar(char c) {
93 unsigned char uc = static_cast<unsigned char>(c);
94 return g_is_binary_char.Get()[uc];
97 void InitTrigramCharsMap() {
98 for (size_t i = 0; i < 256; ++i) {
99 if (i > 127) {
100 g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
101 continue;
103 char ch = i;
104 if (ch == '\t')
105 ch = ' ';
106 if (ch >= 'A' && ch <= 'Z')
107 ch = ch - 'A' + 'a';
108 if ((IsBinaryChar(ch)) || (ch < ' ')) {
109 g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
110 continue;
113 if (ch >= 'Z')
114 ch = ch - 'Z' - 1 + 'A';
115 ch -= ' ';
116 char signed_trigram_char_count = static_cast<char>(kTrigramCharacterCount);
117 CHECK(ch >= 0 && ch < signed_trigram_char_count);
118 g_trigram_chars.Get().push_back(ch);
122 TrigramChar TrigramCharForChar(char c) {
123 unsigned char uc = static_cast<unsigned char>(c);
124 return g_trigram_chars.Get()[uc];
127 Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) {
128 static int kTrigramCharacterCountSquared =
129 kTrigramCharacterCount * kTrigramCharacterCount;
130 if (trigram_chars[index] == kUndefinedTrigramChar ||
131 trigram_chars[index + 1] == kUndefinedTrigramChar ||
132 trigram_chars[index + 2] == kUndefinedTrigramChar)
133 return kUndefinedTrigram;
134 Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] +
135 kTrigramCharacterCount * trigram_chars[index + 1] +
136 trigram_chars[index + 2];
137 return trigram;
140 Index::Index() : last_file_id_(0) {
141 index_.resize(kTrigramCount);
142 is_normalized_.resize(kTrigramCount);
143 std::fill(is_normalized_.begin(), is_normalized_.end(), true);
146 Index::~Index() {}
148 Time Index::LastModifiedTimeForFile(const FilePath& file_path) {
149 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
150 Time last_modified_time;
151 if (index_times_.find(file_path) != index_times_.end())
152 last_modified_time = index_times_[file_path];
153 return last_modified_time;
156 void Index::SetTrigramsForFile(const FilePath& file_path,
157 const vector<Trigram>& index,
158 const Time& time) {
159 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
160 FileId file_id = GetFileId(file_path);
161 vector<Trigram>::const_iterator it = index.begin();
162 for (; it != index.end(); ++it) {
163 Trigram trigram = *it;
164 index_[trigram].push_back(file_id);
165 is_normalized_[trigram] = false;
167 index_times_[file_path] = time;
170 vector<FilePath> Index::Search(string query) {
171 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
172 const char* data = query.c_str();
173 vector<TrigramChar> trigram_chars;
174 trigram_chars.reserve(query.size());
175 for (size_t i = 0; i < query.size(); ++i)
176 trigram_chars.push_back(TrigramCharForChar(data[i]));
177 vector<Trigram> trigrams;
178 for (size_t i = 0; i + 2 < query.size(); ++i) {
179 Trigram trigram = TrigramAtIndex(trigram_chars, i);
180 if (trigram != kUndefinedTrigram)
181 trigrams.push_back(trigram);
183 set<FileId> file_ids;
184 bool first = true;
185 vector<Trigram>::const_iterator it = trigrams.begin();
186 for (; it != trigrams.end(); ++it) {
187 Trigram trigram = *it;
188 if (first) {
189 std::copy(index_[trigram].begin(),
190 index_[trigram].end(),
191 std::inserter(file_ids, file_ids.begin()));
192 first = false;
193 continue;
195 set<FileId> intersection;
196 std::set_intersection(file_ids.begin(),
197 file_ids.end(),
198 index_[trigram].begin(),
199 index_[trigram].end(),
200 std::inserter(intersection, intersection.begin()));
201 file_ids.swap(intersection);
203 vector<FilePath> result;
204 FileIdsMap::const_iterator ids_it = file_ids_.begin();
205 for (; ids_it != file_ids_.end(); ++ids_it) {
206 if (trigrams.size() == 0 ||
207 file_ids.find(ids_it->second) != file_ids.end()) {
208 result.push_back(ids_it->first);
211 return result;
214 FileId Index::GetFileId(const FilePath& file_path) {
215 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
216 string file_path_str = file_path.AsUTF8Unsafe();
217 if (file_ids_.find(file_path) != file_ids_.end())
218 return file_ids_[file_path];
219 file_ids_[file_path] = ++last_file_id_;
220 return last_file_id_;
223 void Index::NormalizeVectors() {
224 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
225 for (size_t i = 0; i < kTrigramCount; ++i) {
226 if (!is_normalized_[i]) {
227 std::sort(index_[i].begin(), index_[i].end());
228 if (index_[i].capacity() > index_[i].size())
229 vector<FileId>(index_[i]).swap(index_[i]);
230 is_normalized_[i] = true;
235 void Index::PrintStats() {
236 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
237 LOG(ERROR) << "Index stats:";
238 size_t size = 0;
239 size_t maxSize = 0;
240 size_t capacity = 0;
241 for (size_t i = 0; i < kTrigramCount; ++i) {
242 if (index_[i].size() > maxSize)
243 maxSize = index_[i].size();
244 size += index_[i].size();
245 capacity += index_[i].capacity();
247 LOG(ERROR) << " - total trigram count: " << size;
248 LOG(ERROR) << " - max file count per trigram: " << maxSize;
249 LOG(ERROR) << " - total vectors capacity " << capacity;
250 size_t total_index_size =
251 capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount;
252 LOG(ERROR) << " - estimated total index size " << total_index_size;
255 typedef Callback<void(bool, const vector<bool>&)> IndexerCallback;
257 } // namespace
259 DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
260 const FilePath& file_system_path,
261 const TotalWorkCallback& total_work_callback,
262 const WorkedCallback& worked_callback,
263 const DoneCallback& done_callback)
264 : file_system_path_(file_system_path),
265 total_work_callback_(total_work_callback),
266 worked_callback_(worked_callback),
267 done_callback_(done_callback),
268 current_file_(BrowserThread::GetMessageLoopProxyForThread(
269 BrowserThread::FILE).get()),
270 files_indexed_(0),
271 stopped_(false) {
272 current_trigrams_set_.resize(kTrigramCount);
273 current_trigrams_.reserve(kTrigramCount);
276 DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
278 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
279 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
280 BrowserThread::PostTask(
281 BrowserThread::FILE,
282 FROM_HERE,
283 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
286 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
287 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
288 BrowserThread::PostTask(BrowserThread::FILE,
289 FROM_HERE,
290 Bind(&FileSystemIndexingJob::StopOnFileThread, this));
293 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
294 stopped_ = true;
297 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
298 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
299 if (stopped_)
300 return;
301 if (!file_enumerator_) {
302 file_enumerator_.reset(
303 new FileEnumerator(file_system_path_, true, FileEnumerator::FILES));
305 FilePath file_path = file_enumerator_->Next();
306 if (file_path.empty()) {
307 BrowserThread::PostTask(
308 BrowserThread::UI,
309 FROM_HERE,
310 Bind(total_work_callback_, file_path_times_.size()));
311 indexing_it_ = file_path_times_.begin();
312 IndexFiles();
313 return;
315 Time saved_last_modified_time =
316 g_trigram_index.Get().LastModifiedTimeForFile(file_path);
317 FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo();
318 Time current_last_modified_time = file_info.GetLastModifiedTime();
319 if (current_last_modified_time > saved_last_modified_time) {
320 file_path_times_[file_path] = current_last_modified_time;
322 BrowserThread::PostTask(
323 BrowserThread::FILE,
324 FROM_HERE,
325 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
328 void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
329 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
330 if (stopped_)
331 return;
332 if (indexing_it_ == file_path_times_.end()) {
333 g_trigram_index.Get().NormalizeVectors();
334 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_);
335 return;
337 FilePath file_path = indexing_it_->first;
338 current_file_.CreateOrOpen(
339 file_path,
340 base::File::FLAG_OPEN | base::File::FLAG_READ,
341 Bind(&FileSystemIndexingJob::StartFileIndexing, this));
344 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
345 base::File::Error error) {
346 if (!current_file_.IsValid()) {
347 FinishFileIndexing(false);
348 return;
350 current_file_offset_ = 0;
351 current_trigrams_.clear();
352 std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false);
353 ReadFromFile();
356 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
357 if (stopped_) {
358 CloseFile();
359 return;
361 current_file_.Read(current_file_offset_, kMaxReadLength,
362 Bind(&FileSystemIndexingJob::OnRead, this));
365 void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
366 base::File::Error error,
367 const char* data,
368 int bytes_read) {
369 if (error != base::File::FILE_OK) {
370 FinishFileIndexing(false);
371 return;
374 if (!bytes_read || bytes_read < 3) {
375 FinishFileIndexing(true);
376 return;
379 size_t size = static_cast<size_t>(bytes_read);
380 vector<TrigramChar> trigram_chars;
381 trigram_chars.reserve(size);
382 for (size_t i = 0; i < size; ++i) {
383 if (IsBinaryChar(data[i])) {
384 current_trigrams_.clear();
385 FinishFileIndexing(true);
386 return;
388 trigram_chars.push_back(TrigramCharForChar(data[i]));
391 for (size_t i = 0; i + 2 < size; ++i) {
392 Trigram trigram = TrigramAtIndex(trigram_chars, i);
393 if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) {
394 current_trigrams_set_[trigram] = true;
395 current_trigrams_.push_back(trigram);
398 current_file_offset_ += bytes_read - 2;
399 ReadFromFile();
402 void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
403 bool success) {
404 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
405 CloseFile();
406 if (success) {
407 FilePath file_path = indexing_it_->first;
408 g_trigram_index.Get().SetTrigramsForFile(
409 file_path, current_trigrams_, file_path_times_[file_path]);
411 ReportWorked();
412 ++indexing_it_;
413 IndexFiles();
416 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
417 if (current_file_.IsValid())
418 current_file_.Close(Bind(&FileSystemIndexingJob::CloseCallback, this));
421 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
422 base::File::Error error) {}
424 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
425 TimeTicks current_time = TimeTicks::Now();
426 bool should_send_worked_nitification = true;
427 if (!last_worked_notification_time_.is_null()) {
428 TimeDelta delta = current_time - last_worked_notification_time_;
429 if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification)
430 should_send_worked_nitification = false;
432 ++files_indexed_;
433 if (should_send_worked_nitification) {
434 last_worked_notification_time_ = current_time;
435 BrowserThread::PostTask(
436 BrowserThread::UI, FROM_HERE, Bind(worked_callback_, files_indexed_));
437 files_indexed_ = 0;
441 DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
442 static bool maps_initialized = false;
443 if (!maps_initialized) {
444 InitIsBinaryCharMap();
445 InitTrigramCharsMap();
446 maps_initialized = true;
450 DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
452 scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob>
453 DevToolsFileSystemIndexer::IndexPath(
454 const string& file_system_path,
455 const TotalWorkCallback& total_work_callback,
456 const WorkedCallback& worked_callback,
457 const DoneCallback& done_callback) {
458 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
459 scoped_refptr<FileSystemIndexingJob> indexing_job =
460 new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path),
461 total_work_callback,
462 worked_callback,
463 done_callback);
464 indexing_job->Start();
465 return indexing_job;
468 void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path,
469 const string& query,
470 const SearchCallback& callback) {
471 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
472 BrowserThread::PostTask(
473 BrowserThread::FILE,
474 FROM_HERE,
475 Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread,
476 this,
477 file_system_path,
478 query,
479 callback));
482 void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
483 const string& file_system_path,
484 const string& query,
485 const SearchCallback& callback) {
486 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
487 vector<FilePath> file_paths = g_trigram_index.Get().Search(query);
488 vector<string> result;
489 FilePath path = FilePath::FromUTF8Unsafe(file_system_path);
490 vector<FilePath>::const_iterator it = file_paths.begin();
491 for (; it != file_paths.end(); ++it) {
492 if (path.IsParent(*it))
493 result.push_back(it->AsUTF8Unsafe());
495 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, Bind(callback, result));