// Copyright (C) 2005 Novell, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
28 using System
.Collections
;
29 using System
.Diagnostics
;
32 using System
.Threading
;
35 using System
.Xml
.Serialization
;
39 using FSQ
= Beagle
.Daemon
.FileSystemQueryable
.FileSystemQueryable
;
41 namespace Beagle
.Daemon
// NOTE(review): the enclosing class declaration (original lines 42-44) was
// lost in extraction; the static members below belong to the build-index
// tool's single class.
// Raw command-line vector, kept so the process can re-exec itself later.
45 static string [] argv
;
// Boolean command-line switches; all default to off.
47 static bool arg_recursive
= false, arg_debug
= false, arg_cache_text
= false, arg_disable_filtering
= false, arg_disable_restart
= false;
// --remap entries: source path prefix -> target path prefix.
49 static Hashtable remap_table
= new Hashtable ();
// --target, --tag and --source values (null until parsed).
51 static string arg_output
, arg_tag
, arg_source
;
53 /////////////////////////////////////////////////////////
55 // Files and directories that are allowed to be in the target
56 // directory before we blow it away. If we encounter any file
57 // or dir not in this list, we'll bail out.
58 static string [] allowed_files
= {
59 "FileAttributesStore.db",
// NOTE(review): the rest of the allowed_files initializer and the whole
// allowed_dirs initializer body (original lines 60-70) are missing from
// this extraction -- recover them from the original file.
64 static string [] allowed_dirs
= {
71 /////////////////////////////////////////////////////////
// File-attribute persistence: sqlite backing store plus its wrapper.
73 static FileAttributesStore_Sqlite backing_fa_store
;
74 static FileAttributesStore fa_store
;
// The Lucene driver that actually writes the index.
76 static LuceneIndexingDriver driver
;
// Cross-thread state flags written by the crawl/index/signal code.
78 static bool crawling
= true, indexing
= true, shutdown
= false, restart
= false;
// --allow-pattern / --deny-pattern entries (ExcludeItem instances).
80 static ArrayList allowed_patterns
= new ArrayList ();
81 static ArrayList denied_patterns
= new ArrayList ();
// Work queues: filled by CrawlWorker, drained by IndexWorker.
83 static Queue pending_files
= new Queue ();
84 static Queue pending_directories
= new Queue ();
// Number of indexables accumulated before flushing to the driver.
86 const int BATCH_SIZE
= 30;
88 /////////////////////////////////////////////////////////
// Entry point: run DoMain and convert any unhandled exception into a
// logged error plus an immediate non-zero exit, so the wrapper script
// sees a failure instead of a mono stack trace.
// (Structural lines -- braces, the DoMain call and the Exit call -- were
// missing from the garbled source and have been restored.)
static void Main (string [] args)
{
	try {
		DoMain (args);
	} catch (Exception ex) {
		Logger.Log.Error ("Unhandled exception thrown. Exiting immediately.");
		Logger.Log.Error (ex);
		Environment.Exit (1);
	}
}
// DoMain: parse the command line, validate the target directory, set up
// the Lucene driver and file-attribute stores, run the crawl/index worker
// threads, and optionally re-exec the process when memory use grows.
// NOTE(review): large parts of this method (the argument switch scaffolding,
// braces, loop increments) were dropped by the extraction; only fragments
// remain below.
101 static void DoMain (string [] args
)
103 SystemInformation
.SetProcessName ("beagle-build-index");
// Walk the argument vector; next_arg is the lookahead value for options
// that take a parameter.
109 while (i
< args
.Length
) {
111 string arg
= args
[i
];
// NOTE(review): the "+1" offsets appear to have been dropped here; the
// lookahead presumably reads args [i+1] -- confirm against the original.
113 string next_arg
= i
< args
.Length
? args
[i
] : null;
// Option parsing fragments. The switch statement, case labels for several
// options, and the break statements were lost in extraction.
122 if (next_arg
!= null)
// --recursive: descend into subdirectories while crawling.
129 arg_recursive
= true;
132 case "--enable-text-cache":
133 arg_cache_text
= true;
// --remap path1:path2 -- requires a value with a ':' separator.
137 if (next_arg
== null)
140 int j
= next_arg
.IndexOf (":");
143 Logger
.Log
.Error ("Invalid remap argument: {0}", next_arg
);
144 Environment
.Exit (1);
// Store the mapping: text before ':' maps to text after it.
147 remap_table
[next_arg
.Substring (0, j
)] = next_arg
.Substring (j
+1);
// --target: normalize the index output path to an absolute path.
153 if (next_arg
!= null)
154 arg_output
= Path
.IsPathRooted (next_arg
) ? next_arg
: Path
.GetFullPath (next_arg
);
158 case "--disable-filtering":
159 arg_disable_filtering
= true;
162 case "--allow-pattern":
163 if (next_arg
== null)
// A comma-separated value adds one ExcludeItem per pattern.
166 if (next_arg
.IndexOf (',') != -1) {
167 foreach (string pattern
in next_arg
.Split (','))
168 allowed_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, pattern
));
171 allowed_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, next_arg
));
177 case "--deny-pattern":
178 if (next_arg
== null)
// Same comma-splitting behavior as --allow-pattern.
181 if (next_arg
.IndexOf (',') != -1) {
182 foreach (string pattern
in next_arg
.Split (','))
183 denied_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, pattern
))
// Non-option arguments are source paths: normalize to absolute and queue
// directories and files separately for the crawler.
205 string path
= Path
.IsPathRooted (arg
) ? arg
: Path
.GetFullPath (arg
);
207 if (Directory
.Exists (path
))
208 pending_directories
.Enqueue (new DirectoryInfo (path
));
209 else if (File
.Exists (path
))
210 pending_files
.Enqueue (new FileInfo (path
));
217 /////////////////////////////////////////////////////////
// Sanity checks: --target is mandatory...
219 if (arg_output
== null) {
220 Logger
.Log
.Error ("--target must be specified");
221 Environment
.Exit (1);
// ...and the target must not be one of the source directories...
224 foreach (FileSystemInfo info
in pending_directories
) {
225 if (Path
.GetFullPath (arg_output
) == info
.FullName
) {
226 Logger
.Log
.Error ("Target directory cannot be one of the source paths.");
227 Environment
.Exit (1);
// ...nor one of the source files...
231 foreach (FileSystemInfo info
in pending_files
) {
232 if (Path
.GetFullPath (arg_output
) == info
.FullName
) {
233 Logger
.Log
.Error ("Target directory cannot be one of the source paths.");
234 Environment
.Exit (1);
// ...and the target's parent directory must already exist.
238 if (!Directory
.Exists (Path
.GetDirectoryName (arg_output
))) {
239 Logger
.Log
.Error ("Index directory not available for construction: {0}", arg_output
);
240 Environment
.Exit (1);
243 // Be *EXTRA PARANOID* about the contents of the target
244 // directory, because creating an indexing driver will
// (the rest of this warning comment was lost in extraction; the driver
// presumably deletes the target's contents -- confirm against original)
246 if (Directory
.Exists (arg_output
)) {
// Refuse to proceed if any file in the target is not on the allow-list.
248 foreach (FileInfo info
in DirectoryWalker
.GetFileInfos (arg_output
)) {
249 if (Array
.IndexOf (allowed_files
, info
.Name
) == -1) {
250 Logger
.Log
.Error ("{0} doesn't look safe to delete: non-Beagle file {1} was found", arg_output
, info
.FullName
);
251 Environment
.Exit (1);
// Same check for subdirectories of the target.
255 foreach (DirectoryInfo info
in DirectoryWalker
.GetDirectoryInfos (arg_output
)) {
256 if (Array
.IndexOf (allowed_dirs
, info
.Name
) == -1) {
257 Logger
.Log
.Error ("{0} doesn't look safe to delete: non-Beagle directory {1} was found", arg_output
, info
.FullName
);
258 Environment
.Exit (1);
263 // Set the IO priority to idle so we don't slow down the system
264 IoPriority
.SetIdle ();
// Create the Lucene driver over the target directory; the text cache is
// only built when --enable-text-cache was given.
266 driver
= new LuceneIndexingDriver (arg_output
, false);
267 driver
.TextCache
= (arg_cache_text
) ? new TextCache (arg_output
) : null;
268 if (driver
.TextCache
!= null)
269 driver
.TextCache
.WorldReadable
= true;
// File-attribute store lives alongside the index, keyed by its fingerprint.
271 backing_fa_store
= new FileAttributesStore_Sqlite (driver
.TopDirectory
, driver
.Fingerprint
);
272 fa_store
= new FileAttributesStore (backing_fa_store
);
274 // Set up signal handlers
275 SetupSignalHandlers ();
// Worker threads; the memory monitor is optional (see --disable-restart).
277 Thread crawl_thread
, index_thread
, monitor_thread
= null;
279 // Start the thread that does the crawling
280 crawl_thread
= ExceptionHandlingThread
.Start (new ThreadStart (CrawlWorker
));
282 // Start the thread that does the actual indexing
283 index_thread
= ExceptionHandlingThread
.Start (new ThreadStart (IndexWorker
));
285 if (!arg_disable_restart
) {
286 // Start the thread that monitors memory usage.
287 monitor_thread
= ExceptionHandlingThread
.Start (new ThreadStart (MemoryMonitorWorker
));
290 // Join all the threads so that we know that we're the only thread still running
291 crawl_thread
.Join ();
292 index_thread
.Join ();
293 if (monitor_thread
!= null)
294 monitor_thread
.Join ();
// Re-exec path: when the memory monitor requested a restart, spawn a fresh
// mono process with our original command line. NOTE(review): the guard
// (presumably "if (restart)") and the Start() call were lost in extraction.
297 Logger
.Log
.Debug ("Restarting helper");
298 Process p
= new Process ();
299 p
.StartInfo
.UseShellExecute
= false;
300 // FIXME: Maybe this isn't the right way to do things? It should be ok,
301 // the PATH is inherited from the shell script which runs mono itself.
302 p
.StartInfo
.FileName
= "mono";
303 p
.StartInfo
.Arguments
= String
.Join (" ", Environment
.GetCommandLineArgs ());
308 /////////////////////////////////////////////////////////////////
// CrawlWorker: breadth-first walk of pending_directories, queueing every
// acceptable file into pending_files for IndexWorker. NOTE(review): the
// condition heads (presumably "if (!Ignore (subdir)" / "if (!Ignore (file)"),
// the directory counter increment, and the "crawling = false" epilogue were
// lost in extraction -- only continuation fragments remain.
310 static void CrawlWorker ()
312 Logger
.Log
.Debug ("Starting CrawlWorker");
317 while (pending_directories
.Count
> 0) {
318 DirectoryInfo dir
= (DirectoryInfo
) pending_directories
.Dequeue ();
// Queue subdirectories (skipping symlinks) for later crawling.
322 foreach (DirectoryInfo subdir
in DirectoryWalker
.GetDirectoryInfos (dir
))
324 && !FileSystem
.IsSymLink (subdir
.FullName
))
325 pending_directories
.Enqueue (subdir
);
// Queue regular files (skipping symlinks) for indexing.
327 foreach (FileInfo file
in DirectoryWalker
.GetFileInfos (dir
))
329 && !FileSystem
.IsSymLink (file
.FullName
))
330 pending_files
.Enqueue (file
);
// A directory can vanish between discovery and the walk; ignore it.
332 } catch (DirectoryNotFoundException e
) {}
340 Logger
.Log
.Debug ("Scanned {0} files in {1} directories", pending_files
.Count
, count_dirs
);
342 Logger
.Log
.Debug ("CrawlWorker Done");
348 /////////////////////////////////////////////////////////////////
// Stamp an indexable with the standard build-index metadata (filtering
// mode, --tag property, source name) and append it to the request batch.
// (Braces and the tag null-guard were missing from the garbled source and
// have been restored; confirm the guard against the original file.)
static void AddToRequest (IndexerRequest request, Indexable indexable)
{
	// Disable filtering and only index file attributes
	if (arg_disable_filtering)
		indexable.Filtering = IndexableFiltering.Never;

	// Tag the item for easy identification (for say, removal)
	if (arg_tag != null)
		indexable.AddProperty (Property.NewUnsearched("Tag", arg_tag));

	// Default the source name to the target directory's basename.
	if (arg_source == null) {
		DirectoryInfo dir = new DirectoryInfo (StringFu.SanitizePath (arg_output));
		arg_source = dir.Name;
	}

	indexable.Source = arg_source;

	request.Add (indexable);
}
// Flush the pending request through the indexer, write back per-file
// attributes for every top-level item that was added, and re-queue any
// child indexables the filters produced. Returns the raw receipts.
// (Declarations, a continue, the loop braces and the final return were
// missing from the garbled source and have been restored -- verify
// against the original file.)
static IndexerReceipt [] FlushIndexer (IIndexer indexer, IndexerRequest request)
{
	IndexerReceipt [] receipts;
	receipts = indexer.Flush (request);

	ArrayList pending_children;
	pending_children = new ArrayList ();

	foreach (IndexerReceipt raw_r in receipts) {

		if (raw_r is IndexerAddedReceipt) {
			// Update the file attributes
			IndexerAddedReceipt r = (IndexerAddedReceipt) raw_r;

			Indexable indexable = request.GetByUri (r.Uri);

			// We don't need to write out any file attributes for
			// children -- only top-level files get them.
			if (indexable.ParentUri != null)
				continue;

			string path = r.Uri.LocalPath;

			FileAttributes attr;
			attr = fa_store.ReadOrCreate (path);

			attr.LastWriteTime = indexable.Timestamp;
			attr.FilterName = r.FilterName;
			attr.FilterVersion = r.FilterVersion;

			fa_store.Write (attr);

		} else if (raw_r is IndexerChildIndexablesReceipt) {
			// Add any child indexables back into our indexer
			IndexerChildIndexablesReceipt r = (IndexerChildIndexablesReceipt) raw_r;
			pending_children.AddRange (r.Children);
		}
	}

	request.Clear (); // clear out the old request
	foreach (Indexable i in pending_children) // and then add the children
		AddToRequest (request, i);

	return receipts;
}
// IndexWorker: drain pending_files, building Indexables and flushing them
// to the driver in BATCH_SIZE groups; finish with a final flush, an
// attribute-store flush and an index optimize. NOTE(review): the outer
// loop, the skip/continue path, the Indexable declaration and the
// "indexing = false" epilogue were lost in extraction.
416 static void IndexWorker ()
418 Logger
.Log
.Debug ("Starting IndexWorker");
422 IndexerRequest pending_request
;
423 pending_request
= new IndexerRequest ();
426 if (pending_files
.Count
> 0) {
427 FileInfo file
= (FileInfo
) pending_files
.Dequeue ();
428 Uri uri
= UriFu
.PathToFileUri (file
.FullName
);
430 // Check that we really should be indexing the file
431 if (!file
.Exists
|| Ignore (file
) || fa_store
.IsUpToDate (file
.FullName
))
434 // Create the indexable and add the standard properties we
435 // use in the FileSystemQueryable.
436 indexable
= new Indexable (uri
);
437 indexable
.Timestamp
= file
.LastWriteTimeUtc
;
438 FSQ
.AddStandardPropertiesToIndexable (indexable
, file
.Name
, Guid
.Empty
, false);
440 AddToRequest (pending_request
, indexable
);
// Flush in batches so memory use stays bounded.
442 if (pending_request
.Count
>= BATCH_SIZE
) {
443 Logger
.Log
.Debug ("Flushing driver, {0} items in queue", pending_request
.Count
);
444 FlushIndexer (driver
, pending_request
);
445 // FlushIndexer clears the pending_request
// Queue empty but crawler still running: wait for more work.
448 } else if (crawling
) {
449 //Logger.Log.Debug ("IndexWorker: La la la...");
456 // Call Flush until our request is empty. We have to do this in a loop
457 // because children can get added back to the pending request in a flush.
458 while (pending_request
.Count
> 0)
459 FlushIndexer (driver
, pending_request
);
461 backing_fa_store
.Flush ();
463 driver
.OptimizeNow ();
465 Logger
.Log
.Debug ("IndexWorker Done");
471 /////////////////////////////////////////////////////////////////
// MemoryMonitorWorker: poll resident memory while crawling/indexing runs;
// when RSS grows past `threshold` times the starting size, shut the
// workers down (so DoMain can re-exec a fresh process). NOTE(review): the
// last_vmrss declaration, the poll sleep, and the restart/shutdown flag
// writes were lost in extraction.
473 static void MemoryMonitorWorker ()
475 int vmrss_original
= SystemInformation
.VmRss
;
// Growth factor relative to the starting RSS that triggers a restart.
477 const double threshold
= 5.0;
480 while (! shutdown
&& (crawling
|| indexing
)) {
482 // Check resident memory usage
483 int vmrss
= SystemInformation
.VmRss
;
484 double size
= vmrss
/ (double) vmrss_original
;
// Only log when the reading actually changed.
485 if (vmrss
!= last_vmrss
)
486 Logger
.Log
.Debug ("Size: VmRSS={0:0.0} MB, size={1:0.00}, {2:0.0}%",
487 vmrss
/1024.0, size
, 100.0 * (size
- 1) / (threshold
- 1));
489 if (size
> threshold
) {
490 Logger
.Log
.Debug ("Process too big, shutting down!");
500 /////////////////////////////////////////////////////////////////
502 // From BeagleDaemon.cs
// Install OurSignalHandler for SIGINT/SIGTERM (and SIGQUIT unless the
// escape-hatch environment variable is set) so the workers can shut down
// cleanly. (Only the braces were missing from the garbled source.)
static void SetupSignalHandlers ()
{
	// Force OurSignalHandler to be JITed
	OurSignalHandler (-1);

	// Set up our signal handler
	Mono.Unix.Native.Stdlib.signal (Mono.Unix.Native.Signum.SIGINT, OurSignalHandler);
	Mono.Unix.Native.Stdlib.signal (Mono.Unix.Native.Signum.SIGTERM, OurSignalHandler);
	if (Environment.GetEnvironmentVariable("BEAGLE_THERE_BE_NO_QUITTIN") == null)
		Mono.Unix.Native.Stdlib.signal (Mono.Unix.Native.Signum.SIGQUIT, OurSignalHandler);
}
// Signal callback: a negative value is the pre-JIT warm-up call and does
// nothing; a real signal logs and requests shutdown. (The warm-up guard
// and the shutdown-flag write were missing from the garbled source and
// have been restored -- the guard is implied by the comment below and by
// SetupSignalHandlers calling this with -1.)
static void OurSignalHandler (int signal)
{
	// This allows us to call OurSignalHandler w/o doing anything.
	// We want to call it once to ensure that it is pre-JITed.
	if (signal < 0)
		return;

	Logger.Log.Debug ("Shutdown Requested");
	shutdown = true;
}
527 /////////////////////////////////////////////////////////////////
// PrintUsage: write the help text to stdout and exit 0. NOTE(review): the
// "string usage =" declaration, the concatenation joins, and several lines
// of the usage string literal were lost in extraction; the literal cannot
// be reconstructed here without guessing user-visible text.
529 static void PrintUsage ()
532 "beagle-build-index: Build an index.\n" +
533 "Web page: http://www.gnome.org/projects/beagle\n" +
534 "Copyright (C) 2005-2006 Novell, Inc.\n\n";
537 "Usage: beagle-build-index [OPTIONS] --target <index_path> <path> [path]\n\n" +
540 "beagle-build-index will *delete all existing data* within the target\n" +
541 "directory. Ensure that the target path is set correctly before running.\n\n" +
544 " --source [name]\t\tThe index's source name. Defaults to the target directory name\n" +
545 " --remap [path1:path2]\t\tRemap data paths to fit target. \n" +
546 " --tag [tag]\t\t\tTag index data for identification.\n" +
547 " --recursive\t\t\tCrawl source path recursivly.\n" +
548 " --enable-text-cache\t\tBuild text-cache of documents used for snippets.\n" +
549 " --disable-filtering\t\tDisable all filtering of files. Only index attributes.\n" +
550 " --allow-pattern [pattern]\tOnly allow files that match the pattern to be indexed.\n" +
551 " --deny-pattern [pattern]\tKeep any files that match the pattern from being indexed.\n" +
552 " --disable-restart\t\tDon't restart when memory usage gets above a certain threshold.\n" +
553 " --debug\t\t\tEcho verbose debugging information.\n\n";
556 Console
.WriteLine (usage
);
557 Environment
.Exit (0);
560 /////////////////////////////////////////////////////////
// Apply the --remap substitutions to a file URI: return a URI with the
// first matching remap key replaced by its value, or the URI unchanged
// when no key appears in its local path. (The continue and the fallback
// return were missing from the garbled source and have been restored.)
static Uri RemapUri (Uri uri)
{
	// FIXME: This is ghetto
	foreach (DictionaryEntry dict in remap_table) {
		if (uri.LocalPath.IndexOf ((string) dict.Key) == -1)
			continue;
		return new Uri (uri.LocalPath.Replace ((string) dict.Key, (string) dict.Value));
	}
	return uri;
}
// Directory filter for the crawler: hidden directories (names starting
// with a dot) are skipped; everything else is crawled. (Braces and the
// two returns were missing from the garbled source and have been restored.)
static bool Ignore (DirectoryInfo directory)
{
	if (directory.Name.StartsWith ("."))
		return true;

	return false;
}
// File filter for the indexer: skips dot-files and symlinks, then applies
// --allow-pattern / --deny-pattern rules. NOTE(review): the return
// statements, braces and the method's tail (after the FIXME) run past or
// were dropped from this extraction -- only the condition fragments remain.
581 static bool Ignore (FileInfo file
)
// Hidden files (leading dot) are skipped.
583 if (file
.Name
.StartsWith ("."))
// Symlinks are skipped to avoid duplicates and cycles.
586 if (FileSystem
.IsSymLink (file
.FullName
))
// When an allow-list exists, presumably only matching files pass --
// confirm the dropped return values against the original source.
589 if (allowed_patterns
.Count
> 0) {
590 foreach (ExcludeItem pattern
in allowed_patterns
)
591 if (pattern
.IsMatch (file
.Name
))
// Deny patterns: matching files are skipped.
597 foreach (ExcludeItem pattern
in denied_patterns
)
598 if (pattern
.IsMatch (file
.Name
))
601 // FIXME: Add more stuff here