4 // Copyright (C) 2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in all
16 // copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 // SOFTWARE.
28 using System
.Collections
;
29 using System
.Diagnostics
;
32 using System
.Threading
;
35 using System
.Xml
.Serialization
;
37 using Lucene
.Net
.Documents
;
38 using Lucene
.Net
.Index
;
39 using LNS
= Lucene
.Net
.Search
;
43 using FSQ
= Beagle
.Daemon
.FileSystemQueryable
.FileSystemQueryable
;
45 namespace Beagle
.Daemon
// NOTE(review): this file is a lossy extraction of Beagle's beagle-build-index
// tool (BuildIndex.cs). The number at the start of each logical line is the
// original source line number; gaps in that numbering mean lines (braces,
// statements) were dropped during extraction, so this text will not compile
// as-is. Comments below describe only what the visible code shows.
//
// --- Command-line state shared across the tool ---
// Raw argument vector (assignment site not visible in this extract).
49 static string [] argv
;
// Boolean flags toggled by the option-parsing loop in DoMain.
51 static bool arg_recursive
= false, arg_delete
= false, arg_debug
= false, arg_cache_text
= false, arg_disable_filtering
= false, arg_disable_restart
= false, arg_disable_directories
= false;
// Maps source path prefixes to replacement prefixes; filled by the (commented
// out in usage text) --remap option and consumed by RemapUri.
53 static Hashtable remap_table
= new Hashtable ();
// --target, --tag and --source values; arg_source defaults to the target
// directory name (see AddToRequest).
55 static string arg_output
, arg_tag
, arg_source
;
57 /////////////////////////////////////////////////////////
59 // Files and directories that are allowed to be in the target
60 // directory before we blow it away. If we encounter any file
61 // or dir not in this list, we'll bail out.
// NOTE(review): the full whitelist is truncated in this extract; only the
// first entry of allowed_files is visible and allowed_dirs shows none.
62 static string [] allowed_files
= {
63 "FileAttributesStore.db",
68 static string [] allowed_dirs
= {
75 /////////////////////////////////////////////////////////
// File-attribute persistence (SQLite-backed store plus its wrapper).
77 static FileAttributesStore_Sqlite backing_fa_store
;
78 static FileAttributesStore fa_store
;
// The Lucene driver that owns the target index.
80 static LuceneIndexingDriver driver
;
// Cross-thread status flags: CrawlWorker/IndexWorker progress, shutdown
// request from the signal handler, and restart request from the memory monitor.
82 static bool crawling
= true, indexing
= true, shutdown
= false, restart
= false;
// Include/exclude patterns built from --allow-pattern / --deny-pattern.
84 static ArrayList allowed_patterns
= new ArrayList ();
85 static ArrayList denied_patterns
= new ArrayList ();
// Work queues: CrawlWorker produces, IndexWorker consumes. pending_files
// holds both FileInfo and DirectoryInfo entries (see IndexWorker).
87 static Queue pending_files
= new Queue ();
88 static Queue pending_directories
= new Queue ();
// Number of indexables accumulated before FlushIndexer is invoked.
90 const int BATCH_SIZE
= 30;
92 /////////////////////////////////////////////////////////
// Entry point: delegates to DoMain and turns any unhandled exception into a
// logged error plus a non-zero exit. (The try block and the call to DoMain
// are among the lines missing from this extract.)
94 static void Main (string [] args
)
98 } catch (Exception ex
) {
99 Logger
.Log
.Error (ex
, "Unhandled exception thrown. Exiting immediately.");
100 Environment
.Exit (1);
// Real main: parses the command line, validates the target directory, builds
// the Lucene driver and attribute store, then runs crawl/index/monitor
// threads to completion. NOTE(review): the option-parsing switch header,
// many case labels, braces and break statements are missing from this
// extract -- the embedded original line numbers show the gaps.
104 static void DoMain (string [] args
)
106 SystemInformation
.SetProcessName ("beagle-build-index");
// --- Option-parsing loop over args (index i; initialization not visible) ---
112 while (i
< args
.Length
) {
114 string arg
= args
[i
];
// Look ahead one argument for options that take a value.
// NOTE(review): as extracted this reads args[i] again rather than args[i+1];
// presumably an increment of i occurs on a missing line -- confirm against
// the original source before trusting this lookahead.
116 string next_arg
= i
< args
.Length
? args
[i
] : null;
125 if (next_arg
!= null)
132 arg_recursive
= true;
135 case "--enable-deletion":
139 case "--disable-directories":
140 arg_disable_directories
= true;
143 case "--enable-text-cache":
144 arg_cache_text
= true;
// --remap takes "path1:path2"; reject values without a colon.
149 if (next_arg == null)
152 int j = next_arg.IndexOf (":");
155 Logger.Log.Error ("Invalid remap argument: {0}", next_arg);
156 Environment.Exit (1);
159 remap_table [next_arg.Substring (0, j)] = next_arg.Substring (j+1);
// --target: store an absolute path in arg_output.
165 if (next_arg
!= null)
166 arg_output
= Path
.IsPathRooted (next_arg
) ? next_arg
: Path
.GetFullPath (next_arg
);
170 case "--disable-filtering":
171 arg_disable_filtering
= true;
// --allow-pattern: comma-separated list of patterns, each becoming an
// ExcludeItem used as a whitelist in Ignore(FileInfo).
174 case "--allow-pattern":
175 if (next_arg
== null)
178 if (next_arg
.IndexOf (',') != -1) {
179 foreach (string pattern
in next_arg
.Split (','))
180 allowed_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, pattern
));
183 allowed_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, next_arg
));
// --deny-pattern: same parsing, but populates the blacklist.
189 case "--deny-pattern":
190 if (next_arg
== null)
193 if (next_arg
.IndexOf (',') != -1) {
194 foreach (string pattern
in next_arg
.Split (','))
195 denied_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, pattern
));
198 denied_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, next_arg
));
204 case "--disable-restart":
205 arg_disable_restart
= true;
209 if (next_arg
== null)
212 arg_source
= next_arg
;
// Non-option argument: treat as a source path. Normalize to an absolute
// path and strip a trailing slash (but keep "/" itself).
217 string path
= Path
.IsPathRooted (arg
) ? arg
: Path
.GetFullPath (arg
);
218 if (path
!= "/" && path
.EndsWith ("/"))
219 path
= path
.TrimEnd ('/');
// Route existing paths into the appropriate work queue.
221 if (Directory
.Exists (path
))
222 pending_directories
.Enqueue (new DirectoryInfo (path
));
223 else if (File
.Exists (path
))
224 pending_files
.Enqueue (new FileInfo (path
));
231 /////////////////////////////////////////////////////////
// --- Validation: target is mandatory and must not be a source path ---
233 if (arg_output
== null) {
234 Logger
.Log
.Error ("--target must be specified");
235 Environment
.Exit (1);
238 foreach (FileSystemInfo info
in pending_directories
) {
239 if (Path
.GetFullPath (arg_output
) == info
.FullName
) {
240 Logger
.Log
.Error ("Target directory cannot be one of the source paths.");
241 Environment
.Exit (1);
245 foreach (FileSystemInfo info
in pending_files
) {
246 if (Path
.GetFullPath (arg_output
) == info
.FullName
) {
247 Logger
.Log
.Error ("Target directory cannot be one of the source paths.");
248 Environment
.Exit (1);
// The parent of the target directory must already exist.
252 if (!Directory
.Exists (Path
.GetDirectoryName (arg_output
))) {
253 Logger
.Log
.Error ("Index directory not available for construction: {0}", arg_output
);
254 Environment
.Exit (1);
257 // Be *EXTRA PARANOID* about the contents of the target
258 // directory, because creating an indexing driver will
// Refuse to proceed if the target contains anything not on the whitelist,
// since index creation implies deleting the target's contents.
260 if (Directory
.Exists (arg_output
)) {
262 foreach (FileInfo info
in DirectoryWalker
.GetFileInfos (arg_output
)) {
263 if (Array
.IndexOf (allowed_files
, info
.Name
) == -1) {
264 Logger
.Log
.Error ("{0} doesn't look safe to delete: non-Beagle file {1} was found", arg_output
, info
.FullName
);
265 Environment
.Exit (1);
269 foreach (DirectoryInfo info
in DirectoryWalker
.GetDirectoryInfos (arg_output
)) {
270 if (Array
.IndexOf (allowed_dirs
, info
.Name
) == -1) {
271 Logger
.Log
.Error ("{0} doesn't look safe to delete: non-Beagle directory {1} was found", arg_output
, info
.FullName
);
272 Environment
.Exit (1);
277 // Set the IO priority so we don't slow down the system
278 IoPriority
.ReduceIoPriority ();
// --- Build the driver, optional text cache, and attribute stores ---
280 driver
= new LuceneIndexingDriver (arg_output
, false);
281 driver
.TextCache
= (arg_cache_text
) ? new TextCache (arg_output
) : null;
282 if (driver
.TextCache
!= null)
283 driver
.TextCache
.WorldReadable
= true;
285 backing_fa_store
= new FileAttributesStore_Sqlite (driver
.TopDirectory
, driver
.Fingerprint
);
286 fa_store
= new FileAttributesStore (backing_fa_store
);
288 // Set up signal handlers
289 SetupSignalHandlers ();
291 Thread crawl_thread
, index_thread
, monitor_thread
= null;
293 Stopwatch watch
= new Stopwatch ();
296 // Start the thread that does the crawling
297 crawl_thread
= ExceptionHandlingThread
.Start (new ThreadStart (CrawlWorker
));
299 // Start the thread that does the actual indexing
300 index_thread
= ExceptionHandlingThread
.Start (new ThreadStart (IndexWorker
));
302 if (!arg_disable_restart
) {
303 // Start the thread that monitors memory usage.
304 monitor_thread
= ExceptionHandlingThread
.Start (new ThreadStart (MemoryMonitorWorker
));
307 // Join all the threads so that we know that we're the only thread still running
308 crawl_thread
.Join ();
309 index_thread
.Join ();
310 if (monitor_thread
!= null)
311 monitor_thread
.Join ();
314 Logger
.Log
.Debug ("Elapsed time {0}.", watch
);
// --- Restart path: re-exec ourselves under mono (presumably when the
// memory monitor set `restart`; the guarding if is on a missing line) ---
317 Logger
.Log
.Debug ("Restarting helper");
318 Process p
= new Process ();
319 p
.StartInfo
.UseShellExecute
= false;
320 // FIXME: Maybe this isn't the right way to do things? It should be ok,
321 // the PATH is inherited from the shell script which runs mono itself.
322 p
.StartInfo
.FileName
= "mono";
323 p
.StartInfo
.Arguments
= String
.Join (" ", Environment
.GetCommandLineArgs ());
328 /////////////////////////////////////////////////////////////////
// Crawl thread: drains pending_directories, enqueues each directory itself
// (unless --disable-directories) plus its files into pending_files, and
// enqueues subdirectories for further crawling. The Ignore() checks guarding
// the foreach bodies are on lines missing from this extract.
330 static void CrawlWorker ()
332 Logger
.Log
.Debug ("Starting CrawlWorker");
338 while (pending_directories
.Count
> 0) {
339 DirectoryInfo dir
= (DirectoryInfo
) pending_directories
.Dequeue ();
// Directories are indexed as items too, unless disabled.
341 if (! arg_disable_directories
)
342 pending_files
.Enqueue (dir
);
// Recurse into subdirectories, skipping special files (sockets, fifos, ...).
346 foreach (DirectoryInfo subdir
in DirectoryWalker
.GetDirectoryInfos (dir
))
348 && !FileSystem
.IsSpecialFile (subdir
.FullName
))
349 pending_directories
.Enqueue (subdir
);
// Queue regular files for indexing.
351 foreach (FileInfo file
in DirectoryWalker
.GetFileInfos (dir
))
353 && !FileSystem
.IsSpecialFile (file
.FullName
)) {
354 pending_files
.Enqueue (file
);
// A directory may vanish between being queued and being walked; best-effort
// crawl, so this is deliberately swallowed.
358 } catch (DirectoryNotFoundException e
) {}
366 Logger
.Log
.Debug ("Scanned {0} files and directories in {1} directories", count_dirs
+ count_files
, count_dirs
);
368 Logger
.Log
.Debug ("CrawlWorker Done");
374 /////////////////////////////////////////////////////////////////
// Decorates an indexable with the tool-wide options (filtering mode, tag,
// source name) and adds it to the given request batch.
376 static void AddToRequest (IndexerRequest request
, Indexable indexable
)
378 // Disable filtering and only index file attributes
379 if (arg_disable_filtering
)
380 indexable
.Filtering
= IndexableFiltering
.Never
;
382 // Tag the item for easy identification (for say, removal)
384 indexable
.AddProperty (Property
.NewUnsearched("Tag", arg_tag
));
// Default the source name to the target directory's name on first use.
386 if (arg_source
== null) {
387 DirectoryInfo dir
= new DirectoryInfo (StringFu
.SanitizePath (arg_output
));
388 arg_source
= dir
.Name
;
391 indexable
.Source
= arg_source
;
393 request
.Add (indexable
);
// Flushes the pending request through the indexer and processes the
// receipts: added items get their file attributes written, removed items get
// their attributes dropped, and child indexables are queued back into the
// (cleared) request for the caller's next flush. Returns the raw receipts.
396 static IndexerReceipt
[] FlushIndexer (IIndexer indexer
, IndexerRequest request
)
398 IndexerReceipt
[] receipts
;
399 receipts
= indexer
.Flush (request
);
401 ArrayList pending_children
;
402 pending_children
= new ArrayList ();
404 foreach (IndexerReceipt raw_r
in receipts
) {
406 if (raw_r
is IndexerAddedReceipt
) {
407 // Update the file attributes
408 IndexerAddedReceipt r
= (IndexerAddedReceipt
) raw_r
;
410 Indexable indexable
= request
.GetByUri (r
.Uri
);
412 // We don't need to write out any file attributes for
// (child indexables -- those with a parent -- per the visible guard)
414 if (indexable
.ParentUri
!= null)
417 string path
= r
.Uri
.LocalPath
;
// Record the timestamp and filter used so a later run can skip
// up-to-date files (see FileToIndexable's IsUpToDate check).
420 attr
= fa_store
.ReadOrCreate (path
);
422 attr
.LastWriteTime
= indexable
.Timestamp
;
423 attr
.FilterName
= r
.FilterName
;
424 attr
.FilterVersion
= r
.FilterVersion
;
426 fa_store
.Write (attr
);
428 } else if (raw_r
is IndexerRemovedReceipt
) {
429 // Update the file attributes
430 IndexerRemovedReceipt r
= (IndexerRemovedReceipt
) raw_r
;
432 Indexable indexable
= request
.GetByUri (r
.Uri
);
434 string path
= r
.Uri
.LocalPath
;
435 Logger
.Log
.Debug ("Removing: '{0}'", path
);
436 fa_store
.Drop (path
);
438 } else if (raw_r
is IndexerChildIndexablesReceipt
) {
439 // Add any child indexables back into our indexer
440 IndexerChildIndexablesReceipt r
= (IndexerChildIndexablesReceipt
) raw_r
;
441 pending_children
.AddRange (r
.Children
);
// Refill the request with children so callers loop until it drains.
445 request
.Clear (); // clear out the old request
446 foreach (Indexable i
in pending_children
) // and then add the children
447 AddToRequest (request
, i
);
// Builds an Indexable for a regular file, or (per the visible guard) skips
// files that no longer exist, are ignored, or whose stored attributes say
// they are already up to date. The skip-return and final return statements
// are on lines missing from this extract.
452 static Indexable
FileToIndexable (FileInfo file
)
454 if (!file
.Exists
|| Ignore (file
) || fa_store
.IsUpToDate (file
.FullName
))
457 // Create the indexable and add the standard properties we
458 // use in the FileSystemQueryable.
459 Uri uri
= UriFu
.PathToFileUri (file
.FullName
);
460 Indexable indexable
= new Indexable (uri
);
461 indexable
.Timestamp
= file
.LastWriteTimeUtc
;
462 FSQ
.AddStandardPropertiesToIndexable (indexable
, file
.Name
, Guid
.Empty
, false);
464 // Store directory name in the index
465 string dirname
= file
.DirectoryName
;
466 indexable
.AddProperty (Property
.NewUnsearched (ParentDirUriPropKey
, UriFu
.PathToFileUri (dirname
)));
// Builds an Indexable for a directory. Already-indexed directories whose
// mtime moved forward are queued into modified_directories so IndexWorker
// can later sweep them for deletions (--enable-deletion). The early-return
// for up-to-date directories and the final return are on missing lines.
471 static Indexable
DirectoryToIndexable (DirectoryInfo dir
, Queue modified_directories
)
476 // Check if the directory information is stored in attributes store
477 // And if the mtime of the directory is same as that in the attributes store
478 FileAttributes attr
= fa_store
.Read (dir
.FullName
);
480 // If the directory exists in the fa store, then it is already indexed
// Directory changed since last index: schedule a deletion sweep of it.
482 if (arg_delete
&& dir
.LastWriteTimeUtc
> attr
.LastWriteTime
)
483 modified_directories
.Enqueue (dir
);
487 // Create the indexable and add the standard properties we
488 // use in the FileSystemQueryable.
489 Uri uri
= UriFu
.PathToFileUri (dir
.FullName
);
490 Indexable indexable
= new Indexable (uri
);
491 indexable
.MimeType
= "inode/directory";
492 indexable
.NoContent
= true;
493 indexable
.Timestamp
= dir
.LastWriteTimeUtc
;
494 FSQ
.AddStandardPropertiesToIndexable (indexable
, dir
.Name
, Guid
.Empty
, false);
496 // Add directory name property
497 string dirname
= dir
.Parent
.FullName
;
498 indexable
.AddProperty (Property
.NewUnsearched (ParentDirUriPropKey
, UriFu
.PathToFileUri (dirname
)));
// Marks the document so GetAllItemsInDirectory/DocumentToDirent can tell
// directories from files when sweeping for deletions.
500 indexable
.AddProperty (Property
.NewBool (IsDirectoryPropKey
, true));
// Index thread: consumes pending_files (which holds both FileInfo and
// DirectoryInfo entries), batches indexables into an IndexerRequest, flushes
// every BATCH_SIZE items, then sweeps modified directories for deleted
// entries, drains the final request, flushes attributes and optimizes the
// index. The enclosing while loop, some braces and the idle-wait when the
// crawler is still running are on lines missing from this extract.
505 static void IndexWorker ()
507 Logger
.Log
.Debug ("Starting IndexWorker");
508 Queue modified_directories
= new Queue ();
512 IndexerRequest pending_request
;
513 pending_request
= new IndexerRequest ();
516 if (pending_files
.Count
> 0) {
517 Object file_or_dir_info
= pending_files
.Dequeue ();
// Dispatch on the queued item's runtime type.
519 if (file_or_dir_info
is DirectoryInfo
)
520 indexable
= DirectoryToIndexable ((DirectoryInfo
) file_or_dir_info
, modified_directories
);
522 indexable
= FileToIndexable ((FileInfo
) file_or_dir_info
);
// null means skipped (ignored or already up to date).
524 if (indexable
== null)
527 AddToRequest (pending_request
, indexable
);
529 if (pending_request
.Count
>= BATCH_SIZE
) {
530 Logger
.Log
.Debug ("Flushing driver, {0} items in queue", pending_request
.Count
);
531 FlushIndexer (driver
, pending_request
);
532 // FlushIndexer clears the pending_request
535 } else if (crawling
) {
536 //Logger.Log.Debug ("IndexWorker: La la la...");
543 // Time to remove deleted directories from the index and attributes store
544 while (modified_directories
.Count
> 0) {
545 DirectoryInfo subdir
= (DirectoryInfo
) modified_directories
.Dequeue ();
546 Logger
.Log
.Debug ("Checking {0} for deleted files and directories", subdir
.FullName
);
548 // Get a list of all documents from lucene index with ParentDirUriPropKey set as that of subdir
549 ICollection all_dirent
= GetAllItemsInDirectory (subdir
);
550 foreach (Dirent info
in all_dirent
) {
551 // check if the item exists
552 if (File
.Exists (info
.FullName
) ||
553 (info
.IsDirectory
&& Directory
.Exists (info
.FullName
)))
// Item is gone from disk: recurse into deleted directories and
// queue a removal indexable for the item itself.
556 if (info
.IsDirectory
)
557 // Recursively remove deleted subdirectories
558 modified_directories
.Enqueue (new DirectoryInfo (info
.FullName
));
561 Uri uri
= UriFu
.PathToFileUri (info
.FullName
);
562 indexable
= new Indexable (IndexableType
.Remove
, uri
);
563 AddToRequest (pending_request
, indexable
);
567 // Call Flush until our request is empty. We have to do this in a loop
568 // because children can get added back to the pending request in a flush.
569 while (pending_request
.Count
> 0)
570 FlushIndexer (driver
, pending_request
);
572 backing_fa_store
.Flush ();
574 Logger
.Log
.Debug ("Optimizing index");
575 driver
.OptimizeNow ();
577 Logger
.Log
.Debug ("IndexWorker Done");
// Members of the Dirent helper class (class header and its `path` field are
// on lines missing from this extract): a lightweight (path, is_directory)
// pair used when sweeping directories for deleted entries.
584 private bool is_directory
;
587 public Dirent (string path
, bool is_dir
)
590 this.is_directory
= is_dir
;
593 public bool IsDirectory
{
594 get { return is_directory; }
// FullName strips the first 7 characters of the stored path -- presumably
// the "file://" URI scheme prefix, since Dirents are built from the "Uri"
// field of Lucene documents (see DocumentToDirent); confirm against caller.
601 public string FullName
{
602 get { return path.Substring (7); }
// Lucene HitCollector that records matching document ids in a caller-owned
// BetterBitArray. The Collect body is on lines missing from this extract
// (presumably it sets matches[id] -- confirm against the original source).
606 private class BitArrayHitCollector
: LNS
.HitCollector
{
608 private BetterBitArray matches
;
610 public BitArrayHitCollector (BetterBitArray matches
)
612 this.matches
= matches
;
615 public override void Collect (int id
, float score
)
// Property keys stored on every indexable: the parent directory's URI
// (written by FileToIndexable/DirectoryToIndexable, queried by
// GetAllItemsInDirectory) and the is-directory flag (read by DocumentToDirent).
621 private const string ParentDirUriPropKey
= "beagle:ParentDirUri";
622 private const string IsDirectoryPropKey
= "beagle:IsDirectory";
624 // Returns a list of all files and directories in dir
// Queries the Lucene index for documents whose ParentDirUri property equals
// dir's URI, collects matching doc ids into a bit array, converts each hit
// to a Dirent, and returns the list. Loop-variable initializations and the
// final return are on lines missing from this extract.
625 static ICollection
GetAllItemsInDirectory (DirectoryInfo dir
)
628 string parent_uri_str
= UriFu
.PathToFileUri (dir
.FullName
).ToString ();
629 // Instead of taking the painfull way of using BeagleAnalyzer, lets just add the prefix manually
630 //parent_uri_str = "_:" + parent_uri_str;
631 // LuceneCommon thinks exposing secret property type encoding is bad, I think so too... except for now
// "prop:k:" is LuceneCommon's internal field-name prefix for keyword props.
632 string key
= "prop:k:" + ParentDirUriPropKey
;
633 //Logger.Log.Debug ("Querying for {0}={1}", parent_uri_str, key);
634 LNS
.Query query
= new LNS
.TermQuery (new Term (key
, parent_uri_str
));
637 LNS
.IndexSearcher searcher
;
638 searcher
= LuceneCommon
.GetSearcher (driver
.PrimaryStore
);
640 BetterBitArray matches
;
641 matches
= new BetterBitArray (searcher
.MaxDoc ());
643 BitArrayHitCollector collector
;
644 collector
= new BitArrayHitCollector (matches
);
646 searcher
.Search (query
, null, collector
);
648 // Finally we pull all of the matching documents,
649 // convert them to Dirent, and store them in a list.
651 ArrayList match_list
= new ArrayList ();
// Walk set bits; GetNextTrueIndex advances i to the next matching doc id.
653 while (i
< matches
.Count
) {
655 i
= matches
.GetNextTrueIndex (i
);
656 if (i
>= matches
.Count
)
660 doc
= searcher
.Doc (i
);
663 info
= DocumentToDirent (doc
);
665 match_list
.Add (info
);
// Release the shared searcher back to LuceneCommon's cache.
670 LuceneCommon
.ReleaseSearcher (searcher
);
671 //Logger.Log.Debug ("Found {0} items in {1}", match_list.Count, dir.FullName);
// Converts a Lucene Document back into a Dirent: the "Uri" field supplies
// the path and the IsDirectory property field supplies the flag. Local
// declarations of `path`/`is_dir` are on lines missing from this extract.
676 static private Dirent
DocumentToDirent (Document doc
)
681 path
= doc
.Get ("Uri");
683 string prop_key
= "prop:k:" + IsDirectoryPropKey
;
684 foreach (Field f
in doc
.Fields ()) {
685 if (f
.Name () != prop_key
)
// Substring(2) skips LuceneCommon's 2-char property-type encoding prefix
// on the stored value before comparing against "true".
688 is_dir
= (f
.StringValue ().Substring (2) == "true");
692 //Logger.Log.Debug ("Found: " + path + " (" + is_dir + ")");
693 return new Dirent (path
, is_dir
);
696 /////////////////////////////////////////////////////////////////
// Monitor thread: polls resident memory while crawl/index threads run; if
// RSS grows past `threshold` times its starting value, logs and shuts down
// (the restart flag presumably triggers the re-exec path in DoMain; the
// shutdown assignment and loop sleep are on missing lines).
698 static void MemoryMonitorWorker ()
700 int vmrss_original
= SystemInformation
.VmRss
;
// Allow memory to grow to 6x the initial RSS before bailing.
702 const double threshold
= 6.0;
705 while (! shutdown
&& (crawling
|| indexing
)) {
707 // Check resident memory usage
708 int vmrss
= SystemInformation
.VmRss
;
709 double size
= vmrss
/ (double) vmrss_original
;
// Only log when the reading changed since last poll.
710 if (vmrss
!= last_vmrss
)
711 Logger
.Log
.Debug ("Size: VmRSS={0:0.0} MB, size={1:0.00}, {2:0.0}%",
712 vmrss
/1024.0, size
, 100.0 * (size
- 1) / (threshold
- 1));
714 if (size
> threshold
) {
715 Logger
.Log
.Debug ("Process too big, shutting down!");
725 /////////////////////////////////////////////////////////////////
727 // From BeagleDaemon.cs
// Installs OurSignalHandler for SIGINT, SIGTERM and (unless the
// BEAGLE_THERE_BE_NO_QUITTIN env var is set) SIGQUIT, via Mono's libc
// signal() binding. Calls the handler once first so it is JIT-compiled
// before a real signal can arrive (JITing inside a signal handler is unsafe).
729 static void SetupSignalHandlers ()
731 // Force OurSignalHandler to be JITed
732 OurSignalHandler (-1);
734 // Set up our signal handler
735 Mono
.Unix
.Native
.Stdlib
.signal (Mono
.Unix
.Native
.Signum
.SIGINT
, OurSignalHandler
);
736 Mono
.Unix
.Native
.Stdlib
.signal (Mono
.Unix
.Native
.Signum
.SIGTERM
, OurSignalHandler
);
// Escape hatch used by the daemon's test harness: suppress SIGQUIT handling.
737 if (Environment
.GetEnvironmentVariable("BEAGLE_THERE_BE_NO_QUITTIN") == null)
738 Mono
.Unix
.Native
.Stdlib
.signal (Mono
.Unix
.Native
.Signum
.SIGQUIT
, OurSignalHandler
);
// Signal handler: logs the request and (on a line missing from this extract)
// presumably sets the `shutdown` flag the worker loops poll. Called with -1
// once at startup purely to force JIT compilation (see SetupSignalHandlers).
741 static void OurSignalHandler (int signal
)
743 // This allows us to call OurSignalHandler w/o doing anything.
744 // We want to call it once to ensure that it is pre-JITed.
748 Logger
.Log
.Debug ("Shutdown Requested");
752 /////////////////////////////////////////////////////////////////
// Prints the usage/help text and exits with status 0. The declaration of the
// `usage` string and several literal fragments are on missing lines.
754 static void PrintUsage ()
757 "beagle-build-index: Build an index.\n" +
758 "Web page: http://www.gnome.org/projects/beagle\n" +
759 "Copyright (C) 2005-2006 Novell, Inc.\n\n";
762 "Usage: beagle-build-index [OPTIONS] --target <index_path> <path> [path]\n\n" +
765 "beagle-build-index will *delete all existing data* within the target\n" +
766 "directory. Ensure that the target path is set correctly before running.\n\n" +
769 " --source [name]\t\tThe index's source name. Defaults to the target directory name\n" +
770 // FIXME: remap doesnt seem to be implemented !
771 // Implementing remap might some fixes to --enable-deletion, see IndexWorker
772 //" --remap [path1:path2]\t\tRemap data paths to fit target. \n" +
773 " --tag [tag]\t\t\tTag index data for identification.\n" +
774 " --recursive\t\t\tCrawl source path recursivly.\n" +
775 " --enable-deletion\t\tRemove deleted files and directories from index.\n" +
776 " --enable-text-cache\t\tBuild text-cache of documents used for snippets.\n" +
777 " --disable-directories\t\tDon't add directories to the index.\n" +
778 " --disable-filtering\t\tDisable all filtering of files. Only index attributes.\n" +
779 " --allow-pattern [pattern]\tOnly allow files that match the pattern to be indexed.\n" +
780 " --deny-pattern [pattern]\tKeep any files that match the pattern from being indexed.\n" +
781 " --disable-restart\t\tDon't restart when memory usage gets above a certain threshold.\n" +
782 " --debug\t\t\tEcho verbose debugging information.\n\n";
785 Console
.WriteLine (usage
);
786 Environment
.Exit (0);
789 /////////////////////////////////////////////////////////
// Rewrites a file URI using the first remap_table entry whose key appears in
// the local path (substring match, not prefix -- hence the FIXME). The
// no-match `continue` and fall-through `return uri` are on missing lines.
791 static Uri
RemapUri (Uri uri
)
793 // FIXME: This is ghetto
794 foreach (DictionaryEntry dict
in remap_table
) {
795 if (uri
.LocalPath
.IndexOf ((string) dict
.Key
) == -1)
797 return new Uri (uri
.LocalPath
.Replace ((string) dict
.Key
, (string) dict
.Value
));
// Directory filter: skips hidden (dot-prefixed) directories. The return
// statements are on lines missing from this extract.
802 static bool Ignore (DirectoryInfo directory
)
804 if (directory
.Name
.StartsWith ("."))
// File filter: skips hidden files and special files; when --allow-pattern
// was given, only matching files pass; files matching any --deny-pattern are
// skipped. The return statements are on lines missing from this extract.
810 static bool Ignore (FileInfo file
)
812 if (file
.Name
.StartsWith ("."))
815 if (FileSystem
.IsSpecialFile (file
.FullName
))
// Whitelist mode: presence of any allow-pattern means non-matching files
// are ignored.
818 if (allowed_patterns
.Count
> 0) {
819 foreach (ExcludeItem pattern
in allowed_patterns
)
820 if (pattern
.IsMatch (file
.Name
))
// Blacklist: deny-patterns always apply.
826 foreach (ExcludeItem pattern
in denied_patterns
)
827 if (pattern
.IsMatch (file
.Name
))
830 // FIXME: Add more stuff here