1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/devtools/devtools_file_system_indexer.h"
10 #include "base/callback.h"
11 #include "base/files/file_enumerator.h"
12 #include "base/files/file_util.h"
13 #include "base/files/file_util_proxy.h"
14 #include "base/lazy_instance.h"
15 #include "base/logging.h"
16 #include "base/stl_util.h"
17 #include "base/strings/utf_string_conversions.h"
18 #include "content/public/browser/browser_thread.h"
22 using base::FileEnumerator
;
25 using base::TimeDelta
;
26 using base::TimeTicks
;
27 using content::BrowserThread
;
35 typedef int32 Trigram
;
36 typedef char TrigramChar
;
37 typedef uint16 FileId
;
39 const int kMinTimeoutBetweenWorkedNitification
= 200;
40 // Trigram characters include all ASCII printable characters (32-126) except for
41 // the capital letters, because the index is case insensitive.
42 const size_t kTrigramCharacterCount
= 126 - 'Z' - 1 + 'A' - ' ' + 1;
43 const size_t kTrigramCount
=
44 kTrigramCharacterCount
* kTrigramCharacterCount
* kTrigramCharacterCount
;
45 const int kMaxReadLength
= 10 * 1024;
46 const TrigramChar kUndefinedTrigramChar
= -1;
47 const TrigramChar kBinaryTrigramChar
= -2;
48 const Trigram kUndefinedTrigram
= -1;
53 Time
LastModifiedTimeForFile(const FilePath
& file_path
);
54 void SetTrigramsForFile(const FilePath
& file_path
,
55 const vector
<Trigram
>& index
,
57 vector
<FilePath
> Search(string query
);
59 void NormalizeVectors();
64 FileId
GetFileId(const FilePath
& file_path
);
66 typedef map
<FilePath
, FileId
> FileIdsMap
;
69 // The index in this vector is the trigram id.
70 vector
<vector
<FileId
> > index_
;
71 typedef map
<FilePath
, Time
> IndexedFilesMap
;
72 IndexedFilesMap index_times_
;
73 vector
<bool> is_normalized_
;
75 DISALLOW_COPY_AND_ASSIGN(Index
);
78 base::LazyInstance
<Index
>::Leaky g_trigram_index
= LAZY_INSTANCE_INITIALIZER
;
80 TrigramChar
TrigramCharForChar(char c
) {
81 static TrigramChar
* trigram_chars
= nullptr;
83 trigram_chars
= new TrigramChar
[256];
84 for (size_t i
= 0; i
< 256; ++i
) {
86 trigram_chars
[i
] = kUndefinedTrigramChar
;
89 char ch
= static_cast<char>(i
);
92 if (ch
>= 'A' && ch
<= 'Z')
95 bool is_binary_char
= ch
< 9 || (ch
>= 14 && ch
< 32) || ch
== 127;
97 trigram_chars
[i
] = kBinaryTrigramChar
;
102 trigram_chars
[i
] = kUndefinedTrigramChar
;
107 ch
= ch
- 'Z' - 1 + 'A';
109 char signed_trigram_count
= static_cast<char>(kTrigramCharacterCount
);
110 CHECK(ch
>= 0 && ch
< signed_trigram_count
);
111 trigram_chars
[i
] = ch
;
114 unsigned char uc
= static_cast<unsigned char>(c
);
115 return trigram_chars
[uc
];
118 Trigram
TrigramAtIndex(const vector
<TrigramChar
>& trigram_chars
, size_t index
) {
119 static int kTrigramCharacterCountSquared
=
120 kTrigramCharacterCount
* kTrigramCharacterCount
;
121 if (trigram_chars
[index
] == kUndefinedTrigramChar
||
122 trigram_chars
[index
+ 1] == kUndefinedTrigramChar
||
123 trigram_chars
[index
+ 2] == kUndefinedTrigramChar
)
124 return kUndefinedTrigram
;
125 Trigram trigram
= kTrigramCharacterCountSquared
* trigram_chars
[index
] +
126 kTrigramCharacterCount
* trigram_chars
[index
+ 1] +
127 trigram_chars
[index
+ 2];
131 Index::Index() : last_file_id_(0) {
132 index_
.resize(kTrigramCount
);
133 is_normalized_
.resize(kTrigramCount
);
134 std::fill(is_normalized_
.begin(), is_normalized_
.end(), true);
139 Time
Index::LastModifiedTimeForFile(const FilePath
& file_path
) {
140 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
141 Time last_modified_time
;
142 if (index_times_
.find(file_path
) != index_times_
.end())
143 last_modified_time
= index_times_
[file_path
];
144 return last_modified_time
;
147 void Index::SetTrigramsForFile(const FilePath
& file_path
,
148 const vector
<Trigram
>& index
,
150 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
151 FileId file_id
= GetFileId(file_path
);
152 vector
<Trigram
>::const_iterator it
= index
.begin();
153 for (; it
!= index
.end(); ++it
) {
154 Trigram trigram
= *it
;
155 index_
[trigram
].push_back(file_id
);
156 is_normalized_
[trigram
] = false;
158 index_times_
[file_path
] = time
;
161 vector
<FilePath
> Index::Search(string query
) {
162 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
163 const char* data
= query
.c_str();
164 vector
<TrigramChar
> trigram_chars
;
165 trigram_chars
.reserve(query
.size());
166 for (size_t i
= 0; i
< query
.size(); ++i
) {
167 TrigramChar trigram_char
= TrigramCharForChar(data
[i
]);
168 if (trigram_char
== kBinaryTrigramChar
)
169 trigram_char
= kUndefinedTrigramChar
;
170 trigram_chars
.push_back(trigram_char
);
172 vector
<Trigram
> trigrams
;
173 for (size_t i
= 0; i
+ 2 < query
.size(); ++i
) {
174 Trigram trigram
= TrigramAtIndex(trigram_chars
, i
);
175 if (trigram
!= kUndefinedTrigram
)
176 trigrams
.push_back(trigram
);
178 set
<FileId
> file_ids
;
180 vector
<Trigram
>::const_iterator it
= trigrams
.begin();
181 for (; it
!= trigrams
.end(); ++it
) {
182 Trigram trigram
= *it
;
184 std::copy(index_
[trigram
].begin(),
185 index_
[trigram
].end(),
186 std::inserter(file_ids
, file_ids
.begin()));
190 set
<FileId
> intersection
= base::STLSetIntersection
<set
<FileId
> >(
191 file_ids
, index_
[trigram
]);
192 file_ids
.swap(intersection
);
194 vector
<FilePath
> result
;
195 FileIdsMap::const_iterator ids_it
= file_ids_
.begin();
196 for (; ids_it
!= file_ids_
.end(); ++ids_it
) {
197 if (trigrams
.size() == 0 ||
198 file_ids
.find(ids_it
->second
) != file_ids
.end()) {
199 result
.push_back(ids_it
->first
);
205 FileId
Index::GetFileId(const FilePath
& file_path
) {
206 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
207 string file_path_str
= file_path
.AsUTF8Unsafe();
208 if (file_ids_
.find(file_path
) != file_ids_
.end())
209 return file_ids_
[file_path
];
210 file_ids_
[file_path
] = ++last_file_id_
;
211 return last_file_id_
;
214 void Index::NormalizeVectors() {
215 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
216 for (size_t i
= 0; i
< kTrigramCount
; ++i
) {
217 if (!is_normalized_
[i
]) {
218 std::sort(index_
[i
].begin(), index_
[i
].end());
219 if (index_
[i
].capacity() > index_
[i
].size())
220 vector
<FileId
>(index_
[i
]).swap(index_
[i
]);
221 is_normalized_
[i
] = true;
226 void Index::PrintStats() {
227 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
228 LOG(ERROR
) << "Index stats:";
232 for (size_t i
= 0; i
< kTrigramCount
; ++i
) {
233 if (index_
[i
].size() > maxSize
)
234 maxSize
= index_
[i
].size();
235 size
+= index_
[i
].size();
236 capacity
+= index_
[i
].capacity();
238 LOG(ERROR
) << " - total trigram count: " << size
;
239 LOG(ERROR
) << " - max file count per trigram: " << maxSize
;
240 LOG(ERROR
) << " - total vectors capacity " << capacity
;
241 size_t total_index_size
=
242 capacity
* sizeof(FileId
) + sizeof(vector
<FileId
>) * kTrigramCount
;
243 LOG(ERROR
) << " - estimated total index size " << total_index_size
;
246 typedef Callback
<void(bool, const vector
<bool>&)> IndexerCallback
;
250 DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
251 const FilePath
& file_system_path
,
252 const TotalWorkCallback
& total_work_callback
,
253 const WorkedCallback
& worked_callback
,
254 const DoneCallback
& done_callback
)
255 : file_system_path_(file_system_path
),
256 total_work_callback_(total_work_callback
),
257 worked_callback_(worked_callback
),
258 done_callback_(done_callback
),
259 current_file_(BrowserThread::GetMessageLoopProxyForThread(
260 BrowserThread::FILE).get()),
263 current_trigrams_set_
.resize(kTrigramCount
);
264 current_trigrams_
.reserve(kTrigramCount
);
267 DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
269 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
270 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
271 BrowserThread::PostTask(
274 Bind(&FileSystemIndexingJob::CollectFilesToIndex
, this));
277 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
278 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
279 BrowserThread::PostTask(BrowserThread::FILE,
281 Bind(&FileSystemIndexingJob::StopOnFileThread
, this));
284 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
288 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
289 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
292 if (!file_enumerator_
) {
293 file_enumerator_
.reset(
294 new FileEnumerator(file_system_path_
, true, FileEnumerator::FILES
));
296 FilePath file_path
= file_enumerator_
->Next();
297 if (file_path
.empty()) {
298 BrowserThread::PostTask(
301 Bind(total_work_callback_
, file_path_times_
.size()));
302 indexing_it_
= file_path_times_
.begin();
306 Time saved_last_modified_time
=
307 g_trigram_index
.Get().LastModifiedTimeForFile(file_path
);
308 FileEnumerator::FileInfo file_info
= file_enumerator_
->GetInfo();
309 Time current_last_modified_time
= file_info
.GetLastModifiedTime();
310 if (current_last_modified_time
> saved_last_modified_time
) {
311 file_path_times_
[file_path
] = current_last_modified_time
;
313 BrowserThread::PostTask(
316 Bind(&FileSystemIndexingJob::CollectFilesToIndex
, this));
319 void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
320 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
323 if (indexing_it_
== file_path_times_
.end()) {
324 g_trigram_index
.Get().NormalizeVectors();
325 BrowserThread::PostTask(BrowserThread::UI
, FROM_HERE
, done_callback_
);
328 FilePath file_path
= indexing_it_
->first
;
329 current_file_
.CreateOrOpen(
331 base::File::FLAG_OPEN
| base::File::FLAG_READ
,
332 Bind(&FileSystemIndexingJob::StartFileIndexing
, this));
335 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
336 base::File::Error error
) {
337 if (!current_file_
.IsValid()) {
338 FinishFileIndexing(false);
341 current_file_offset_
= 0;
342 current_trigrams_
.clear();
343 std::fill(current_trigrams_set_
.begin(), current_trigrams_set_
.end(), false);
347 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
352 current_file_
.Read(current_file_offset_
, kMaxReadLength
,
353 Bind(&FileSystemIndexingJob::OnRead
, this));
356 void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
357 base::File::Error error
,
360 if (error
!= base::File::FILE_OK
) {
361 FinishFileIndexing(false);
365 if (!bytes_read
|| bytes_read
< 3) {
366 FinishFileIndexing(true);
370 size_t size
= static_cast<size_t>(bytes_read
);
371 vector
<TrigramChar
> trigram_chars
;
372 trigram_chars
.reserve(size
);
373 for (size_t i
= 0; i
< size
; ++i
) {
374 TrigramChar trigram_char
= TrigramCharForChar(data
[i
]);
375 if (trigram_char
== kBinaryTrigramChar
) {
376 current_trigrams_
.clear();
377 FinishFileIndexing(true);
380 trigram_chars
.push_back(trigram_char
);
383 for (size_t i
= 0; i
+ 2 < size
; ++i
) {
384 Trigram trigram
= TrigramAtIndex(trigram_chars
, i
);
385 if ((trigram
!= kUndefinedTrigram
) && !current_trigrams_set_
[trigram
]) {
386 current_trigrams_set_
[trigram
] = true;
387 current_trigrams_
.push_back(trigram
);
390 current_file_offset_
+= bytes_read
- 2;
394 void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
396 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
399 FilePath file_path
= indexing_it_
->first
;
400 g_trigram_index
.Get().SetTrigramsForFile(
401 file_path
, current_trigrams_
, file_path_times_
[file_path
]);
408 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
409 if (current_file_
.IsValid())
410 current_file_
.Close(Bind(&FileSystemIndexingJob::CloseCallback
, this));
413 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
414 base::File::Error error
) {}
416 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
417 TimeTicks current_time
= TimeTicks::Now();
418 bool should_send_worked_nitification
= true;
419 if (!last_worked_notification_time_
.is_null()) {
420 TimeDelta delta
= current_time
- last_worked_notification_time_
;
421 if (delta
.InMilliseconds() < kMinTimeoutBetweenWorkedNitification
)
422 should_send_worked_nitification
= false;
425 if (should_send_worked_nitification
) {
426 last_worked_notification_time_
= current_time
;
427 BrowserThread::PostTask(
428 BrowserThread::UI
, FROM_HERE
, Bind(worked_callback_
, files_indexed_
));
433 DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
436 DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
438 scoped_refptr
<DevToolsFileSystemIndexer::FileSystemIndexingJob
>
439 DevToolsFileSystemIndexer::IndexPath(
440 const string
& file_system_path
,
441 const TotalWorkCallback
& total_work_callback
,
442 const WorkedCallback
& worked_callback
,
443 const DoneCallback
& done_callback
) {
444 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
445 scoped_refptr
<FileSystemIndexingJob
> indexing_job
=
446 new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path
),
450 indexing_job
->Start();
454 void DevToolsFileSystemIndexer::SearchInPath(const string
& file_system_path
,
456 const SearchCallback
& callback
) {
457 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
458 BrowserThread::PostTask(
461 Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread
,
468 void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
469 const string
& file_system_path
,
471 const SearchCallback
& callback
) {
472 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
473 vector
<FilePath
> file_paths
= g_trigram_index
.Get().Search(query
);
474 vector
<string
> result
;
475 FilePath path
= FilePath::FromUTF8Unsafe(file_system_path
);
476 vector
<FilePath
>::const_iterator it
= file_paths
.begin();
477 for (; it
!= file_paths
.end(); ++it
) {
478 if (path
.IsParent(*it
))
479 result
.push_back(it
->AsUTF8Unsafe());
481 BrowserThread::PostTask(BrowserThread::UI
, FROM_HERE
, Bind(callback
, result
));