1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/devtools/devtools_file_system_indexer.h"
10 #include "base/callback.h"
11 #include "base/file_util.h"
12 #include "base/files/file_enumerator.h"
13 #include "base/files/file_util_proxy.h"
14 #include "base/lazy_instance.h"
15 #include "base/logging.h"
16 #include "base/platform_file.h"
17 #include "base/strings/utf_string_conversions.h"
18 #include "content/public/browser/browser_thread.h"
22 using base::FileEnumerator
;
24 using base::FileUtilProxy
;
26 using base::TimeDelta
;
27 using base::TimeTicks
;
28 using base::PassPlatformFile
;
29 using base::PlatformFile
;
30 using base::PlatformFileError
;
31 using content::BrowserThread
;
39 typedef int32 Trigram
;
40 typedef char TrigramChar
;
41 typedef uint16 FileId
;
43 const int kMinTimeoutBetweenWorkedNitification
= 200;
44 // Trigram characters include all ASCII printable characters (32-126) except for
45 // the capital letters, because the index is case insensitive.
46 const size_t kTrigramCharacterCount
= 126 - 'Z' - 1 + 'A' - ' ' + 1;
47 const size_t kTrigramCount
=
48 kTrigramCharacterCount
* kTrigramCharacterCount
* kTrigramCharacterCount
;
49 const int kMaxReadLength
= 10 * 1024;
50 const TrigramChar kUndefinedTrigramChar
= -1;
51 const Trigram kUndefinedTrigram
= -1;
53 base::LazyInstance
<vector
<bool> >::Leaky g_is_binary_char
=
54 LAZY_INSTANCE_INITIALIZER
;
56 base::LazyInstance
<vector
<TrigramChar
> >::Leaky g_trigram_chars
=
57 LAZY_INSTANCE_INITIALIZER
;
62 Time
LastModifiedTimeForFile(const FilePath
& file_path
);
63 void SetTrigramsForFile(const FilePath
& file_path
,
64 const vector
<Trigram
>& index
,
66 vector
<FilePath
> Search(string query
);
68 void NormalizeVectors();
73 FileId
GetFileId(const FilePath
& file_path
);
75 typedef map
<FilePath
, FileId
> FileIdsMap
;
78 // The index in this vector is the trigram id.
79 vector
<vector
<FileId
> > index_
;
80 typedef map
<FilePath
, Time
> IndexedFilesMap
;
81 IndexedFilesMap index_times_
;
82 vector
<bool> is_normalized_
;
84 DISALLOW_COPY_AND_ASSIGN(Index
);
87 base::LazyInstance
<Index
>::Leaky g_trigram_index
= LAZY_INSTANCE_INITIALIZER
;
89 void InitIsBinaryCharMap() {
90 for (size_t i
= 0; i
< 256; ++i
) {
92 bool is_binary_char
= ch
< 9 || (ch
>= 14 && ch
< 32) || ch
== 127;
93 g_is_binary_char
.Get().push_back(is_binary_char
);
97 bool IsBinaryChar(char c
) {
98 unsigned char uc
= static_cast<unsigned char>(c
);
99 return g_is_binary_char
.Get()[uc
];
102 void InitTrigramCharsMap() {
103 for (size_t i
= 0; i
< 256; ++i
) {
105 g_trigram_chars
.Get().push_back(kUndefinedTrigramChar
);
111 if (ch
>= 'A' && ch
<= 'Z')
113 if ((IsBinaryChar(ch
)) || (ch
< ' ')) {
114 g_trigram_chars
.Get().push_back(kUndefinedTrigramChar
);
119 ch
= ch
- 'Z' - 1 + 'A';
121 char signed_trigram_char_count
= static_cast<char>(kTrigramCharacterCount
);
122 CHECK(ch
>= 0 && ch
< signed_trigram_char_count
);
123 g_trigram_chars
.Get().push_back(ch
);
127 TrigramChar
TrigramCharForChar(char c
) {
128 unsigned char uc
= static_cast<unsigned char>(c
);
129 return g_trigram_chars
.Get()[uc
];
132 Trigram
TrigramAtIndex(const vector
<TrigramChar
>& trigram_chars
, size_t index
) {
133 static int kTrigramCharacterCountSquared
=
134 kTrigramCharacterCount
* kTrigramCharacterCount
;
135 if (trigram_chars
[index
] == kUndefinedTrigramChar
||
136 trigram_chars
[index
+ 1] == kUndefinedTrigramChar
||
137 trigram_chars
[index
+ 2] == kUndefinedTrigramChar
)
138 return kUndefinedTrigram
;
139 Trigram trigram
= kTrigramCharacterCountSquared
* trigram_chars
[index
] +
140 kTrigramCharacterCount
* trigram_chars
[index
+ 1] +
141 trigram_chars
[index
+ 2];
145 Index::Index() : last_file_id_(0) {
146 index_
.resize(kTrigramCount
);
147 is_normalized_
.resize(kTrigramCount
);
148 std::fill(is_normalized_
.begin(), is_normalized_
.end(), true);
153 Time
Index::LastModifiedTimeForFile(const FilePath
& file_path
) {
154 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
155 Time last_modified_time
;
156 if (index_times_
.find(file_path
) != index_times_
.end())
157 last_modified_time
= index_times_
[file_path
];
158 return last_modified_time
;
161 void Index::SetTrigramsForFile(const FilePath
& file_path
,
162 const vector
<Trigram
>& index
,
164 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
165 FileId file_id
= GetFileId(file_path
);
166 vector
<Trigram
>::const_iterator it
= index
.begin();
167 for (; it
!= index
.end(); ++it
) {
168 Trigram trigram
= *it
;
169 index_
[trigram
].push_back(file_id
);
170 is_normalized_
[trigram
] = false;
172 index_times_
[file_path
] = time
;
175 vector
<FilePath
> Index::Search(string query
) {
176 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
177 const char* data
= query
.c_str();
178 vector
<TrigramChar
> trigram_chars
;
179 trigram_chars
.reserve(query
.size());
180 for (size_t i
= 0; i
< query
.size(); ++i
)
181 trigram_chars
.push_back(TrigramCharForChar(data
[i
]));
182 vector
<Trigram
> trigrams
;
183 for (size_t i
= 0; i
+ 2 < query
.size(); ++i
) {
184 Trigram trigram
= TrigramAtIndex(trigram_chars
, i
);
185 if (trigram
!= kUndefinedTrigram
)
186 trigrams
.push_back(trigram
);
188 set
<FileId
> file_ids
;
190 vector
<Trigram
>::const_iterator it
= trigrams
.begin();
191 for (; it
!= trigrams
.end(); ++it
) {
192 Trigram trigram
= *it
;
194 std::copy(index_
[trigram
].begin(),
195 index_
[trigram
].end(),
196 std::inserter(file_ids
, file_ids
.begin()));
200 set
<FileId
> intersection
;
201 std::set_intersection(file_ids
.begin(),
203 index_
[trigram
].begin(),
204 index_
[trigram
].end(),
205 std::inserter(intersection
, intersection
.begin()));
206 file_ids
.swap(intersection
);
208 vector
<FilePath
> result
;
209 FileIdsMap::const_iterator ids_it
= file_ids_
.begin();
210 for (; ids_it
!= file_ids_
.end(); ++ids_it
) {
211 if (trigrams
.size() == 0 ||
212 file_ids
.find(ids_it
->second
) != file_ids
.end()) {
213 result
.push_back(ids_it
->first
);
219 FileId
Index::GetFileId(const FilePath
& file_path
) {
220 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
221 string file_path_str
= file_path
.AsUTF8Unsafe();
222 if (file_ids_
.find(file_path
) != file_ids_
.end())
223 return file_ids_
[file_path
];
224 file_ids_
[file_path
] = ++last_file_id_
;
225 return last_file_id_
;
228 void Index::NormalizeVectors() {
229 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
230 for (size_t i
= 0; i
< kTrigramCount
; ++i
) {
231 if (!is_normalized_
[i
]) {
232 std::sort(index_
[i
].begin(), index_
[i
].end());
233 if (index_
[i
].capacity() > index_
[i
].size())
234 vector
<FileId
>(index_
[i
]).swap(index_
[i
]);
235 is_normalized_
[i
] = true;
240 void Index::PrintStats() {
241 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
242 LOG(ERROR
) << "Index stats:";
246 for (size_t i
= 0; i
< kTrigramCount
; ++i
) {
247 if (index_
[i
].size() > maxSize
)
248 maxSize
= index_
[i
].size();
249 size
+= index_
[i
].size();
250 capacity
+= index_
[i
].capacity();
252 LOG(ERROR
) << " - total trigram count: " << size
;
253 LOG(ERROR
) << " - max file count per trigram: " << maxSize
;
254 LOG(ERROR
) << " - total vectors capacity " << capacity
;
255 size_t total_index_size
=
256 capacity
* sizeof(FileId
) + sizeof(vector
<FileId
>) * kTrigramCount
;
257 LOG(ERROR
) << " - estimated total index size " << total_index_size
;
260 typedef Callback
<void(bool, const vector
<bool>&)> IndexerCallback
;
264 DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
265 const FilePath
& file_system_path
,
266 const TotalWorkCallback
& total_work_callback
,
267 const WorkedCallback
& worked_callback
,
268 const DoneCallback
& done_callback
)
269 : file_system_path_(file_system_path
),
270 total_work_callback_(total_work_callback
),
271 worked_callback_(worked_callback
),
272 done_callback_(done_callback
),
275 current_trigrams_set_
.resize(kTrigramCount
);
276 current_trigrams_
.reserve(kTrigramCount
);
279 DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
281 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
282 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
283 BrowserThread::PostTask(
286 Bind(&FileSystemIndexingJob::CollectFilesToIndex
, this));
289 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
290 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
291 BrowserThread::PostTask(BrowserThread::FILE,
293 Bind(&FileSystemIndexingJob::StopOnFileThread
, this));
296 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
300 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
301 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
304 if (!file_enumerator_
) {
305 file_enumerator_
.reset(
306 new FileEnumerator(file_system_path_
, true, FileEnumerator::FILES
));
308 FilePath file_path
= file_enumerator_
->Next();
309 if (file_path
.empty()) {
310 BrowserThread::PostTask(
313 Bind(total_work_callback_
, file_path_times_
.size()));
314 indexing_it_
= file_path_times_
.begin();
318 Time saved_last_modified_time
=
319 g_trigram_index
.Get().LastModifiedTimeForFile(file_path
);
320 FileEnumerator::FileInfo file_info
= file_enumerator_
->GetInfo();
321 Time current_last_modified_time
= file_info
.GetLastModifiedTime();
322 if (current_last_modified_time
> saved_last_modified_time
) {
323 file_path_times_
[file_path
] = current_last_modified_time
;
325 BrowserThread::PostTask(
328 Bind(&FileSystemIndexingJob::CollectFilesToIndex
, this));
331 void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
332 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
335 if (indexing_it_
== file_path_times_
.end()) {
336 g_trigram_index
.Get().NormalizeVectors();
337 BrowserThread::PostTask(BrowserThread::UI
, FROM_HERE
, done_callback_
);
340 FilePath file_path
= indexing_it_
->first
;
341 FileUtilProxy::CreateOrOpen(
342 BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
344 base::PLATFORM_FILE_OPEN
| base::PLATFORM_FILE_READ
,
345 Bind(&FileSystemIndexingJob::StartFileIndexing
, this));
348 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
349 PlatformFileError error
,
350 PassPlatformFile pass_file
,
352 if (error
!= base::PLATFORM_FILE_OK
) {
353 current_file_
= base::kInvalidPlatformFileValue
;
354 FinishFileIndexing(false);
357 current_file_
= pass_file
.ReleaseValue();
358 current_file_offset_
= 0;
359 current_trigrams_
.clear();
360 std::fill(current_trigrams_set_
.begin(), current_trigrams_set_
.end(), false);
364 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
370 BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
372 current_file_offset_
,
374 Bind(&FileSystemIndexingJob::OnRead
, this));
377 void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
378 PlatformFileError error
,
381 if (error
!= base::PLATFORM_FILE_OK
) {
382 FinishFileIndexing(false);
386 if (!bytes_read
|| bytes_read
< 3) {
387 FinishFileIndexing(true);
391 size_t size
= static_cast<size_t>(bytes_read
);
392 vector
<TrigramChar
> trigram_chars
;
393 trigram_chars
.reserve(size
);
394 for (size_t i
= 0; i
< size
; ++i
) {
395 if (IsBinaryChar(data
[i
])) {
396 current_trigrams_
.clear();
397 FinishFileIndexing(true);
400 trigram_chars
.push_back(TrigramCharForChar(data
[i
]));
403 for (size_t i
= 0; i
+ 2 < size
; ++i
) {
404 Trigram trigram
= TrigramAtIndex(trigram_chars
, i
);
405 if ((trigram
!= kUndefinedTrigram
) && !current_trigrams_set_
[trigram
]) {
406 current_trigrams_set_
[trigram
] = true;
407 current_trigrams_
.push_back(trigram
);
410 current_file_offset_
+= bytes_read
- 2;
414 void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
416 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
419 FilePath file_path
= indexing_it_
->first
;
420 g_trigram_index
.Get().SetTrigramsForFile(
421 file_path
, current_trigrams_
, file_path_times_
[file_path
]);
428 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
429 if (current_file_
!= base::kInvalidPlatformFileValue
) {
430 FileUtilProxy::Close(
431 BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
433 Bind(&FileSystemIndexingJob::CloseCallback
, this));
437 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
438 PlatformFileError error
) {}
440 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
441 TimeTicks current_time
= TimeTicks::Now();
442 bool should_send_worked_nitification
= true;
443 if (!last_worked_notification_time_
.is_null()) {
444 TimeDelta delta
= current_time
- last_worked_notification_time_
;
445 if (delta
.InMilliseconds() < kMinTimeoutBetweenWorkedNitification
)
446 should_send_worked_nitification
= false;
449 if (should_send_worked_nitification
) {
450 last_worked_notification_time_
= current_time
;
451 BrowserThread::PostTask(
452 BrowserThread::UI
, FROM_HERE
, Bind(worked_callback_
, files_indexed_
));
457 DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
458 static bool maps_initialized
= false;
459 if (!maps_initialized
) {
460 InitIsBinaryCharMap();
461 InitTrigramCharsMap();
462 maps_initialized
= true;
466 DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
468 scoped_refptr
<DevToolsFileSystemIndexer::FileSystemIndexingJob
>
469 DevToolsFileSystemIndexer::IndexPath(
470 const string
& file_system_path
,
471 const TotalWorkCallback
& total_work_callback
,
472 const WorkedCallback
& worked_callback
,
473 const DoneCallback
& done_callback
) {
474 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
475 scoped_refptr
<FileSystemIndexingJob
> indexing_job
=
476 new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path
),
480 indexing_job
->Start();
484 void DevToolsFileSystemIndexer::SearchInPath(const string
& file_system_path
,
486 const SearchCallback
& callback
) {
487 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
488 BrowserThread::PostTask(
491 Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread
,
498 void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
499 const string
& file_system_path
,
501 const SearchCallback
& callback
) {
502 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
503 vector
<FilePath
> file_paths
= g_trigram_index
.Get().Search(query
);
504 vector
<string
> result
;
505 FilePath path
= FilePath::FromUTF8Unsafe(file_system_path
);
506 vector
<FilePath
>::const_iterator it
= file_paths
.begin();
507 for (; it
!= file_paths
.end(); ++it
) {
508 if (path
.IsParent(*it
))
509 result
.push_back(it
->AsUTF8Unsafe());
511 BrowserThread::PostTask(BrowserThread::UI
, FROM_HERE
, Bind(callback
, result
));