4 // Copyright (C) 2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in all
16 // copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
28 using System
.Collections
;
29 using System
.Diagnostics
;
32 using System
.Threading
;
35 using System
.Xml
.Serialization
;
37 using Lucene
.Net
.Documents
;
38 using Lucene
.Net
.Index
;
39 using LNS
= Lucene
.Net
.Search
;
43 using FSQ
= Beagle
.Daemon
.FileSystemQueryable
.FileSystemQueryable
;
45 namespace Beagle
.Daemon
// Static state for the index builder: parsed command-line options plus the
// objects and queues shared by the worker methods further down.
// NOTE(review): some original lines are not visible in this view (for example the
// remaining entries and closing braces of the two "allowed" arrays) - confirm
// against the complete file before relying on this block.
49 static string [] argv
;
// Option flags, all off by default.
51 static bool arg_recursive
= false, arg_delete
= false, arg_debug
= false, arg_cache_text
= false, arg_disable_filtering
= false, arg_disable_restart
= false;
// Path remapping table: DoMain stores "text before ':'" -> "text after ':'" pairs
// here and RemapUri reads them.
53 static Hashtable remap_table
= new Hashtable ();
// Target index path, optional tag added to every indexable, and the source name.
55 static string arg_output
, arg_tag
, arg_source
;
57 /////////////////////////////////////////////////////////
59 // Files and directories that are allowed to be in the target
60 // directory before we blow it away. If we encounter any file
61 // or dir not in this list, we'll bail out.
62 static string [] allowed_files
= {
63 "FileAttributesStore.db",
68 static string [] allowed_dirs
= {
75 /////////////////////////////////////////////////////////
// Attribute stores: DoMain wraps the sqlite-backed store in fa_store.
77 static FileAttributesStore_Sqlite backing_fa_store
;
78 static FileAttributesStore fa_store
;
// Indexing driver created by DoMain on the target path.
80 static LuceneIndexingDriver driver
;
// Worker lifecycle flags. MemoryMonitorWorker keeps polling while shutdown is
// false and either crawling or indexing is true.
82 static bool crawling
= true, indexing
= true, shutdown
= false, restart
= false;
// ExcludeItem lists filled from --allow-pattern / --deny-pattern and consulted
// by Ignore (FileInfo).
84 static ArrayList allowed_patterns
= new ArrayList ();
85 static ArrayList denied_patterns
= new ArrayList ();
// Work queues: CrawlWorker dequeues pending_directories and enqueues onto
// pending_files, which IndexWorker dequeues.
// NOTE(review): DoMain runs those workers on separate threads and no locking
// around these queues is visible in this view - confirm synchronization.
87 static Queue pending_files
= new Queue ();
88 static Queue pending_directories
= new Queue ();
// IndexWorker flushes its pending request once it holds at least this many items.
90 const int BATCH_SIZE
= 30;
92 /////////////////////////////////////////////////////////
// Program entry point. The visible catch handler logs any exception as an
// unhandled error and terminates the process with exit status 1.
// NOTE(review): the try body is not visible in this view - presumably it
// calls DoMain (args); confirm against the complete file.
94 static void Main (string [] args
)
98 } catch (Exception ex
) {
99 Logger
.Log
.Error (ex
, "Unhandled exception thrown. Exiting immediately.");
100 Environment
.Exit (1);
// Parses the command line, validates the target directory, creates the
// indexing driver and attribute stores, runs the crawl / index / monitor
// threads to completion, and (per the visible tail of the method) prepares a
// "mono" child process for a restart.
// NOTE(review): many original lines (switch header, most case labels, breaks,
// braces, some guarding conditions) are not visible in this view. The comments
// below describe only the statements that are shown.
104 static void DoMain (string [] args
)
106 SystemInformation
.SetProcessName ("beagle-build-index");
// Argument loop: arg is the current argument and next_arg the candidate value
// for an option, or null when the arguments are exhausted.
// NOTE(review): an increment of i between the two reads below is presumably
// among the lines missing from this view - confirm.
112 while (i
< args
.Length
) {
114 string arg
= args
[i
];
116 string next_arg
= i
< args
.Length
? args
[i
] : null;
125 if (next_arg
!= null)
132 arg_recursive
= true;
135 case "--enable-deletion":
139 case "--enable-text-cache":
140 arg_cache_text
= true;
// Remap value: split at the first ':' and store "before" -> "after" in
// remap_table. The condition guarding the "Invalid remap argument" error is
// not visible here.
145 if (next_arg == null)
148 int j = next_arg.IndexOf (":");
151 Logger.Log.Error ("Invalid remap argument: {0}", next_arg);
152 Environment.Exit (1);
155 remap_table [next_arg.Substring (0, j)] = next_arg.Substring (j+1);
// Output path: keep a rooted path as given, otherwise resolve it to a full path.
161 if (next_arg
!= null)
162 arg_output
= Path
.IsPathRooted (next_arg
) ? next_arg
: Path
.GetFullPath (next_arg
);
166 case "--disable-filtering":
167 arg_disable_filtering
= true;
// A value containing ',' adds one Pattern-type ExcludeItem per comma-separated
// piece; the second Add below registers the whole value as a single pattern.
170 case "--allow-pattern":
171 if (next_arg
== null)
174 if (next_arg
.IndexOf (',') != -1) {
175 foreach (string pattern
in next_arg
.Split (','))
176 allowed_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, pattern
));
179 allowed_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, next_arg
));
// Same comma handling as --allow-pattern, but filling denied_patterns.
185 case "--deny-pattern":
186 if (next_arg
== null)
189 if (next_arg
.IndexOf (',') != -1) {
190 foreach (string pattern
in next_arg
.Split (','))
191 denied_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, pattern
));
194 denied_patterns
.Add (new ExcludeItem (ExcludeType
.Pattern
, next_arg
));
200 case "--disable-restart":
201 arg_disable_restart
= true;
// Source name taken verbatim from the option value.
205 if (next_arg
== null)
208 arg_source
= next_arg
;
// Treat arg as a path: make it absolute, strip trailing '/' characters (except
// for the root path "/"), and queue it as a directory or a file if it exists.
213 string path
= Path
.IsPathRooted (arg
) ? arg
: Path
.GetFullPath (arg
);
214 if (path
!= "/" && path
.EndsWith ("/"))
215 path
= path
.TrimEnd ('/');
217 if (Directory
.Exists (path
))
218 pending_directories
.Enqueue (new DirectoryInfo (path
));
219 else if (File
.Exists (path
))
220 pending_files
.Enqueue (new FileInfo (path
));
227 /////////////////////////////////////////////////////////
// Validation: a target must have been given.
229 if (arg_output
== null) {
230 Logger
.Log
.Error ("--target must be specified");
231 Environment
.Exit (1);
// The target must not be the same path as any queued source directory or file.
234 foreach (FileSystemInfo info
in pending_directories
) {
235 if (Path
.GetFullPath (arg_output
) == info
.FullName
) {
236 Logger
.Log
.Error ("Target directory cannot be one of the source paths.");
237 Environment
.Exit (1);
241 foreach (FileSystemInfo info
in pending_files
) {
242 if (Path
.GetFullPath (arg_output
) == info
.FullName
) {
243 Logger
.Log
.Error ("Target directory cannot be one of the source paths.");
244 Environment
.Exit (1);
// The parent directory of the target must already exist.
248 if (!Directory
.Exists (Path
.GetDirectoryName (arg_output
))) {
249 Logger
.Log
.Error ("Index directory not available for construction: {0}", arg_output
);
250 Environment
.Exit (1);
253 // Be *EXTRA PARANOID* about the contents of the target
254 // directory, because creating an indexing driver will
// (the rest of the comment above is not visible in this view). The checks
// below exit with status 1 if an existing target contains any file or
// directory whose name is not in allowed_files / allowed_dirs.
256 if (Directory
.Exists (arg_output
)) {
258 foreach (FileInfo info
in DirectoryWalker
.GetFileInfos (arg_output
)) {
259 if (Array
.IndexOf (allowed_files
, info
.Name
) == -1) {
260 Logger
.Log
.Error ("{0} doesn't look safe to delete: non-Beagle file {1} was found", arg_output
, info
.FullName
);
261 Environment
.Exit (1);
265 foreach (DirectoryInfo info
in DirectoryWalker
.GetDirectoryInfos (arg_output
)) {
266 if (Array
.IndexOf (allowed_dirs
, info
.Name
) == -1) {
267 Logger
.Log
.Error ("{0} doesn't look safe to delete: non-Beagle directory {1} was found", arg_output
, info
.FullName
);
268 Environment
.Exit (1);
273 // Set the IO priority so we don't slow down the system
// Falls back to SetIoPriority (7) when SetIdle () returns false.
274 if (! IoPriority
.SetIdle ())
275 IoPriority
.SetIoPriority (7);
// Create the driver on the target path. A TextCache is attached only when
// arg_cache_text is set, and is then marked world-readable.
277 driver
= new LuceneIndexingDriver (arg_output
, false);
278 driver
.TextCache
= (arg_cache_text
) ? new TextCache (arg_output
) : null;
279 if (driver
.TextCache
!= null)
280 driver
.TextCache
.WorldReadable
= true;
// File attribute store: sqlite-backed store built from the driver's
// TopDirectory and Fingerprint, wrapped in a FileAttributesStore.
282 backing_fa_store
= new FileAttributesStore_Sqlite (driver
.TopDirectory
, driver
.Fingerprint
);
283 fa_store
= new FileAttributesStore (backing_fa_store
);
285 // Set up signal handlers
286 SetupSignalHandlers ();
// Only monitor_thread is initialized to null; it stays null when restarts are disabled.
288 Thread crawl_thread
, index_thread
, monitor_thread
= null;
// NOTE(review): the call that starts this stopwatch is not visible in this view.
290 Stopwatch watch
= new Stopwatch ();
293 // Start the thread that does the crawling
294 crawl_thread
= ExceptionHandlingThread
.Start (new ThreadStart (CrawlWorker
));
296 // Start the thread that does the actual indexing
297 index_thread
= ExceptionHandlingThread
.Start (new ThreadStart (IndexWorker
));
299 if (!arg_disable_restart
) {
300 // Start the thread that monitors memory usage.
301 monitor_thread
= ExceptionHandlingThread
.Start (new ThreadStart (MemoryMonitorWorker
));
304 // Join all the threads so that we know that we're the only thread still running
305 crawl_thread
.Join ();
306 index_thread
.Join ();
307 if (monitor_thread
!= null)
308 monitor_thread
.Join ();
311 Logger
.Log
.Debug ("Elapsed time {0}.", watch
);
// Restart path: build a "mono" child process whose arguments are this
// process's own command line. The condition guarding this section and the
// call that starts the process are not visible in this view.
// NOTE(review): the arguments are joined with single spaces and no quoting,
// so an argument containing whitespace would not survive the restart intact -
// confirm whether that matters for supported invocations.
314 Logger
.Log
.Debug ("Restarting helper");
315 Process p
= new Process ();
316 p
.StartInfo
.UseShellExecute
= false;
317 // FIXME: Maybe this isn't the right way to do things? It should be ok,
318 // the PATH is inherited from the shell script which runs mono itself.
319 p
.StartInfo
.FileName
= "mono";
320 p
.StartInfo
.Arguments
= String
.Join (" ", Environment
.GetCommandLineArgs ());
325 /////////////////////////////////////////////////////////////////
// Crawl thread body: drains pending_directories, pushing each directory and
// the files found in it onto pending_files, and queueing subdirectories for a
// later pass of the same loop.
// NOTE(review): several lines are not visible in this view (the counters'
// declarations, the first half of both filter conditions, the try header) -
// confirm against the complete file.
327 static void CrawlWorker ()
329 Logger
.Log
.Debug ("Starting CrawlWorker");
335 while (pending_directories
.Count
> 0) {
336 DirectoryInfo dir
= (DirectoryInfo
) pending_directories
.Dequeue ();
// The directory itself is queued for indexing too, not only its contents.
337 pending_files
.Enqueue (dir
);
// Subdirectories that pass the (partly hidden) condition and are not special
// files are queued for crawling.
341 foreach (DirectoryInfo subdir
in DirectoryWalker
.GetDirectoryInfos (dir
))
343 && !FileSystem
.IsSpecialFile (subdir
.FullName
))
344 pending_directories
.Enqueue (subdir
);
// Files that pass the (partly hidden) condition and are not special files are
// queued for indexing.
346 foreach (FileInfo file
in DirectoryWalker
.GetFileInfos (dir
))
348 && !FileSystem
.IsSpecialFile (file
.FullName
)) {
349 pending_files
.Enqueue (file
);
// A directory that disappears during the walk is skipped without any logging
// (empty catch).
353 } catch (DirectoryNotFoundException e
) {}
361 Logger
.Log
.Debug ("Scanned {0} files and directories in {1} directories", count_dirs
+ count_files
, count_dirs
);
363 Logger
.Log
.Debug ("CrawlWorker Done");
369 /////////////////////////////////////////////////////////////////
// Applies the global options to an indexable (filtering mode, tag, source
// name) and then adds it to the given request.
// NOTE(review): at least one original line is not visible between the "Tag"
// comment and the AddProperty call - presumably a guard on arg_tag; confirm.
371 static void AddToRequest (IndexerRequest request
, Indexable indexable
)
373 // Disable filtering and only index file attributes
374 if (arg_disable_filtering
)
375 indexable
.Filtering
= IndexableFiltering
.Never
;
377 // Tag the item for easy identification (for say, removal)
379 indexable
.AddProperty (Property
.NewUnsearched("Tag", arg_tag
));
// When no source name was given, default it once to the name of the target
// directory (after StringFu.SanitizePath) and cache it in arg_source.
381 if (arg_source
== null) {
382 DirectoryInfo dir
= new DirectoryInfo (StringFu
.SanitizePath (arg_output
));
383 arg_source
= dir
.Name
;
386 indexable
.Source
= arg_source
;
388 request
.Add (indexable
);
// Flushes the request through the indexer and processes the receipts:
// added items get their file attributes written, removed items are dropped
// from the attribute store, and child indexables are collected. Afterwards the
// request is cleared and refilled with the collected children, so the caller
// may need to flush again.
// NOTE(review): some lines are not visible in this view (e.g. the statement
// under the ParentUri check, the declaration of attr, the return statement) -
// confirm against the complete file.
391 static IndexerReceipt
[] FlushIndexer (IIndexer indexer
, IndexerRequest request
)
393 IndexerReceipt
[] receipts
;
394 receipts
= indexer
.Flush (request
);
396 ArrayList pending_children
;
397 pending_children
= new ArrayList ();
399 foreach (IndexerReceipt raw_r
in receipts
) {
401 if (raw_r
is IndexerAddedReceipt
) {
402 // Update the file attributes
403 IndexerAddedReceipt r
= (IndexerAddedReceipt
) raw_r
;
// Look up the indexable that produced this receipt by its Uri.
405 Indexable indexable
= request
.GetByUri (r
.Uri
);
407 // We don't need to write out any file attributes for
// (remainder of the comment above is not visible; the check below concerns
// indexables that have a ParentUri.)
409 if (indexable
.ParentUri
!= null)
412 string path
= r
.Uri
.LocalPath
;
// Record the indexed timestamp and the filter name/version for this path.
415 attr
= fa_store
.ReadOrCreate (path
);
417 attr
.LastWriteTime
= indexable
.Timestamp
;
418 attr
.FilterName
= r
.FilterName
;
419 attr
.FilterVersion
= r
.FilterVersion
;
421 fa_store
.Write (attr
);
423 } else if (raw_r
is IndexerRemovedReceipt
) {
424 // Update the file attributes
425 IndexerRemovedReceipt r
= (IndexerRemovedReceipt
) raw_r
;
427 Indexable indexable
= request
.GetByUri (r
.Uri
);
429 string path
= r
.Uri
.LocalPath
;
430 Logger
.Log
.Debug ("Removing: '{0}'", path
);
431 fa_store
.Drop (path
);
433 } else if (raw_r
is IndexerChildIndexablesReceipt
) {
434 // Add any child indexables back into our indexer
435 IndexerChildIndexablesReceipt r
= (IndexerChildIndexablesReceipt
) raw_r
;
436 pending_children
.AddRange (r
.Children
);
440 request
.Clear (); // clear out the old request
441 foreach (Indexable i
in pending_children
) // and then add the children
442 AddToRequest (request
, i
);
// Builds an Indexable for a file, carrying its UTC last-write time, the
// standard FileSystemQueryable properties and the Uri of its parent directory.
// NOTE(review): the statement executed when the first check matches and the
// final return are not visible in this view. IndexWorker tests the result
// for null, so presumably null is returned for skipped files - confirm.
447 static Indexable
FileToIndexable (FileInfo file
)
// Skip check: missing files, ignored files, and files the attribute store
// reports as up to date.
449 if (!file
.Exists
|| Ignore (file
) || fa_store
.IsUpToDate (file
.FullName
))
452 // Create the indexable and add the standard properties we
453 // use in the FileSystemQueryable.
454 Uri uri
= UriFu
.PathToFileUri (file
.FullName
);
455 Indexable indexable
= new Indexable (uri
);
456 indexable
.Timestamp
= file
.LastWriteTimeUtc
;
457 FSQ
.AddStandardPropertiesToIndexable (indexable
, file
.Name
, Guid
.Empty
, false);
459 // Store directory name in the index
460 string dirname
= file
.DirectoryName
;
461 indexable
.AddProperty (Property
.NewUnsearched (ParentDirUriPropKey
, UriFu
.PathToFileUri (dirname
)));
// Builds an Indexable for a directory (mime type "inode/directory", no
// content) carrying its UTC last-write time, the standard properties, the
// parent directory Uri and an is-directory flag. Directories already known to
// the attribute store may instead be queued on modified_directories so deleted
// entries can be detected later.
// NOTE(review): the conditions around the attribute-store lookup and the
// return statements are not visible in this view - confirm against the
// complete file.
466 static Indexable
DirectoryToIndexable (DirectoryInfo dir
, Queue modified_directories
)
471 // Check if the directory information is stored in attributes store
472 // And if the mtime of the directory is same as that in the attributes store
473 FileAttributes attr
= fa_store
.Read (dir
.FullName
);
475 // If the directory exists in the fa store, then it is already indexed
// With deletion enabled, a directory whose mtime is newer than the stored one
// is queued for the deleted-entries scan.
477 if (arg_delete
&& dir
.LastWriteTimeUtc
> attr
.LastWriteTime
)
478 modified_directories
.Enqueue (dir
);
482 // Create the indexable and add the standard properties we
483 // use in the FileSystemQueryable.
484 Uri uri
= UriFu
.PathToFileUri (dir
.FullName
);
485 Indexable indexable
= new Indexable (uri
);
486 indexable
.MimeType
= "inode/directory";
487 indexable
.NoContent
= true;
488 indexable
.Timestamp
= dir
.LastWriteTimeUtc
;
489 FSQ
.AddStandardPropertiesToIndexable (indexable
, dir
.Name
, Guid
.Empty
, false);
491 // Add directory name property
// NOTE(review): dir.Parent is dereferenced unconditionally; confirm the root
// directory (whose Parent is null) cannot reach this point.
492 string dirname
= dir
.Parent
.FullName
;
493 indexable
.AddProperty (Property
.NewUnsearched (ParentDirUriPropKey
, UriFu
.PathToFileUri (dirname
)));
495 indexable
.AddProperty (Property
.NewBool (IsDirectoryPropKey
, true));
// Index thread body: turns queued files and directories into indexables,
// flushes them to the driver in batches of BATCH_SIZE, then removes index
// entries for items that no longer exist on disk, flushes the attribute store
// and optimizes the index.
// NOTE(review): the enclosing loop header, the declaration of indexable, the
// else/continue/break lines and the wait performed while crawling are not
// visible in this view - confirm against the complete file.
500 static void IndexWorker ()
502 Logger
.Log
.Debug ("Starting IndexWorker");
503 Queue modified_directories
= new Queue ();
507 IndexerRequest pending_request
;
508 pending_request
= new IndexerRequest ();
511 if (pending_files
.Count
> 0) {
// Queue entries are either DirectoryInfo or FileInfo objects.
512 Object file_or_dir_info
= pending_files
.Dequeue ();
514 if (file_or_dir_info
is DirectoryInfo
)
515 indexable
= DirectoryToIndexable ((DirectoryInfo
) file_or_dir_info
, modified_directories
);
517 indexable
= FileToIndexable ((FileInfo
) file_or_dir_info
);
519 if (indexable
== null)
522 AddToRequest (pending_request
, indexable
);
524 if (pending_request
.Count
>= BATCH_SIZE
) {
525 Logger
.Log
.Debug ("Flushing driver, {0} items in queue", pending_request
.Count
);
526 FlushIndexer (driver
, pending_request
);
527 // FlushIndexer clears the pending_request
530 } else if (crawling
) {
531 //Logger.Log.Debug ("IndexWorker: La la la...");
538 // Time to remove deleted directories from the index and attributes store
539 while (modified_directories
.Count
> 0) {
540 DirectoryInfo subdir
= (DirectoryInfo
) modified_directories
.Dequeue ();
541 Logger
.Log
.Debug ("Checking {0} for deleted files and directories", subdir
.FullName
);
543 // Get a list of all documents from lucene index with ParentDirUriPropKey set as that of subdir
544 ICollection all_dirent
= GetAllItemsInDirectory (subdir
);
545 foreach (Dirent info
in all_dirent
) {
546 // check if the item exists
547 if (File
.Exists (info
.FullName
) ||
548 (info
.IsDirectory
&& Directory
.Exists (info
.FullName
)))
551 if (info
.IsDirectory
)
552 // Recursively remove deleted subdirectories
553 modified_directories
.Enqueue (new DirectoryInfo (info
.FullName
));
// Queue a Remove-type indexable for the vanished item.
556 Uri uri
= UriFu
.PathToFileUri (info
.FullName
);
557 indexable
= new Indexable (IndexableType
.Remove
, uri
);
558 AddToRequest (pending_request
, indexable
);
562 // Call Flush until our request is empty. We have to do this in a loop
563 // because children can get added back to the pending request in a flush.
564 while (pending_request
.Count
> 0)
565 FlushIndexer (driver
, pending_request
);
567 backing_fa_store
.Flush ();
569 Logger
.Log
.Debug ("Optimizing index");
570 driver
.OptimizeNow ();
572 Logger
.Log
.Debug ("IndexWorker Done");
// Members of the Dirent type: a stored path plus a flag telling whether the
// entry is a directory.
// NOTE(review): the class header, the path field and the assignment of path in
// the constructor are not visible in this view - confirm against the complete
// file.
579 private bool is_directory
;
582 public Dirent (string path
, bool is_dir
)
585 this.is_directory
= is_dir
;
588 public bool IsDirectory
{
589 get { return is_directory; }
// Returns the stored path without its first 7 characters. DocumentToDirent
// passes the document's "Uri" value as path, so presumably this strips a
// "file://" prefix - confirm. A path shorter than 7 characters would make
// Substring throw.
596 public string FullName
{
597 get { return path.Substring (7); }
// Lucene hit collector that holds a caller-supplied BetterBitArray.
// NOTE(review): the body of Collect is not visible in this view; presumably it
// marks the bit for each collected document id - confirm.
601 private class BitArrayHitCollector
: LNS
.HitCollector
{
603 private BetterBitArray matches
;
// Keeps a reference to the caller's array (no copy is made here).
605 public BitArrayHitCollector (BetterBitArray matches
)
607 this.matches
= matches
;
610 public override void Collect (int id
, float score
)
// Property keys attached to indexables by FileToIndexable and
// DirectoryToIndexable, and used (with a "prop:k:" prefix) to build Lucene
// field names in GetAllItemsInDirectory and DocumentToDirent.
616 private const string ParentDirUriPropKey
= "beagle:ParentDirUri";
617 private const string IsDirectoryPropKey
= "beagle:IsDirectory";
619 // Returns a list of all files and directories in dir
// The list is built by querying the driver's primary store for documents whose
// parent-directory property equals the file Uri of dir, and converting each
// match to a Dirent.
// NOTE(review): the declarations of i, doc and info, the loop's break and
// increment, and the return statement are not visible in this view - confirm
// against the complete file.
620 static ICollection
GetAllItemsInDirectory (DirectoryInfo dir
)
623 string parent_uri_str
= UriFu
.PathToFileUri (dir
.FullName
).ToString ();
624 // Instead of taking the painful way of using BeagleAnalyzer, let's just add the prefix manually
625 //parent_uri_str = "_:" + parent_uri_str;
626 // LuceneCommon thinks exposing secret property type encoding is bad, I think so too... except for now
627 string key
= "prop:k:" + ParentDirUriPropKey
;
628 //Logger.Log.Debug ("Querying for {0}={1}", parent_uri_str, key);
629 LNS
.Query query
= new LNS
.TermQuery (new Term (key
, parent_uri_str
));
632 LNS
.IndexSearcher searcher
;
633 searcher
= LuceneCommon
.GetSearcher (driver
.PrimaryStore
);
// The bit array is sized to searcher.MaxDoc () and handed to the collector
// used for the search.
635 BetterBitArray matches
;
636 matches
= new BetterBitArray (searcher
.MaxDoc ());
638 BitArrayHitCollector collector
;
639 collector
= new BitArrayHitCollector (matches
);
641 searcher
.Search (query
, null, collector
);
643 // Finally we pull all of the matching documents,
644 // convert them to Dirent, and store them in a list.
646 ArrayList match_list
= new ArrayList ();
// Walk the set bits with GetNextTrueIndex; an index at or past Count means no
// further matches.
648 while (i
< matches
.Count
) {
650 i
= matches
.GetNextTrueIndex (i
);
651 if (i
>= matches
.Count
)
655 doc
= searcher
.Doc (i
);
658 info
= DocumentToDirent (doc
);
660 match_list
.Add (info
);
// NOTE(review): no try/finally is visible around the search, so an exception
// above would skip this release - confirm against the complete file.
665 LuceneCommon
.ReleaseSearcher (searcher
);
666 //Logger.Log.Debug ("Found {0} items in {1}", match_list.Count, dir.FullName);
// Converts a Lucene document into a Dirent: the path comes from the "Uri"
// field and the directory flag from the is-directory property field.
// NOTE(review): the declarations of path and is_dir (including is_dir's
// initial value) and the continue/break lines of the loop are not visible in
// this view - confirm against the complete file.
671 static private Dirent
DocumentToDirent (Document doc
)
676 path
= doc
.Get ("Uri");
678 string prop_key
= "prop:k:" + IsDirectoryPropKey
;
679 foreach (Field f
in doc
.Fields ()) {
680 if (f
.Name () != prop_key
)
// The first two characters of the stored value are skipped before comparing
// with "true" (presumably a property-type encoding prefix - confirm).
683 is_dir
= (f
.StringValue ().Substring (2) == "true");
687 //Logger.Log.Debug ("Found: " + path + " (" + is_dir + ")");
688 return new Dirent (path
, is_dir
);
691 /////////////////////////////////////////////////////////////////
// Monitor thread body: while work is in progress and no shutdown was
// requested, compares the current resident set size with the value sampled at
// start-up and reacts when it has grown past a fixed factor.
// NOTE(review): the declaration and update of last_vmrss, the actions taken
// after the "Process too big" message and the delay between polls are not
// visible in this view - confirm against the complete file.
693 static void MemoryMonitorWorker ()
// Baseline sampled once when the worker starts.
695 int vmrss_original
= SystemInformation
.VmRss
;
// Growth factor relative to the baseline above which the process is
// considered too big.
697 const double threshold
= 6.0;
700 while (! shutdown
&& (crawling
|| indexing
)) {
702 // Check resident memory usage
703 int vmrss
= SystemInformation
.VmRss
;
// size is the ratio of current to baseline usage.
704 double size
= vmrss
/ (double) vmrss_original
;
// Log only when the reading changed. The value is divided by 1024.0 and
// labelled MB, so VmRss is presumably reported in kB - confirm. The last
// argument expresses the growth as a percentage of the way to the threshold.
705 if (vmrss
!= last_vmrss
)
706 Logger
.Log
.Debug ("Size: VmRSS={0:0.0} MB, size={1:0.00}, {2:0.0}%",
707 vmrss
/1024.0, size
, 100.0 * (size
- 1) / (threshold
- 1));
709 if (size
> threshold
) {
710 Logger
.Log
.Debug ("Process too big, shutting down!");
720 /////////////////////////////////////////////////////////////////
722 // From BeagleDaemon.cs
// Installs OurSignalHandler for SIGINT and SIGTERM, and for SIGQUIT unless the
// BEAGLE_THERE_BE_NO_QUITTIN environment variable is set.
724 static void SetupSignalHandlers ()
726 // Force OurSignalHandler to be JITed
727 OurSignalHandler (-1);
729 // Set up our signal handler
730 Mono
.Unix
.Native
.Stdlib
.signal (Mono
.Unix
.Native
.Signum
.SIGINT
, OurSignalHandler
);
731 Mono
.Unix
.Native
.Stdlib
.signal (Mono
.Unix
.Native
.Signum
.SIGTERM
, OurSignalHandler
);
// SIGQUIT is handled only when the environment variable is absent.
732 if (Environment
.GetEnvironmentVariable("BEAGLE_THERE_BE_NO_QUITTIN") == null)
733 Mono
.Unix
.Native
.Stdlib
.signal (Mono
.Unix
.Native
.Signum
.SIGQUIT
, OurSignalHandler
);
// Signal callback registered by SetupSignalHandlers; the visible part logs
// that a shutdown was requested.
// NOTE(review): the early return for the pre-JIT call (SetupSignalHandlers
// passes -1) and whatever follows the log statement are not visible in this
// view - confirm against the complete file.
736 static void OurSignalHandler (int signal
)
738 // This allows us to call OurSignalHandler w/o doing anything.
739 // We want to call it once to ensure that it is pre-JITed.
743 Logger
.Log
.Debug ("Shutdown Requested");
747 /////////////////////////////////////////////////////////////////
// Writes the usage text to standard output and exits with status 0.
// NOTE(review): the declaration of usage and several lines of the help text
// are not visible in this view - confirm against the complete file.
749 static void PrintUsage ()
752 "beagle-build-index: Build an index.\n" +
753 "Web page: http://www.gnome.org/projects/beagle\n" +
754 "Copyright (C) 2005-2006 Novell, Inc.\n\n";
757 "Usage: beagle-build-index [OPTIONS] --target <index_path> <path> [path]\n\n" +
760 "beagle-build-index will *delete all existing data* within the target\n" +
761 "directory. Ensure that the target path is set correctly before running.\n\n" +
764 " --source [name]\t\tThe index's source name. Defaults to the target directory name\n" +
765 // FIXME: remap doesn't seem to be implemented!
766 // Implementing remap might need some fixes to --enable-deletion, see IndexWorker
767 //" --remap [path1:path2]\t\tRemap data paths to fit target. \n" +
768 " --tag [tag]\t\t\tTag index data for identification.\n" +
769 " --recursive\t\t\tCrawl source path recursivly.\n" +
770 " --enable-deletion\t\tRemove deleted files and directories from index.\n" +
771 " --enable-text-cache\t\tBuild text-cache of documents used for snippets.\n" +
772 " --disable-filtering\t\tDisable all filtering of files. Only index attributes.\n" +
773 " --allow-pattern [pattern]\tOnly allow files that match the pattern to be indexed.\n" +
774 " --deny-pattern [pattern]\tKeep any files that match the pattern from being indexed.\n" +
775 " --disable-restart\t\tDon't restart when memory usage gets above a certain threshold.\n" +
776 " --debug\t\t\tEcho verbose debugging information.\n\n";
779 Console
.WriteLine (usage
);
780 Environment
.Exit (0);
783 /////////////////////////////////////////////////////////
// Rewrites a Uri's local path using remap_table: for an entry whose key occurs
// in the path, returns a new Uri with every occurrence of the key replaced by
// the entry's value.
// NOTE(review): the statement under the IndexOf check and the fall-through
// return are not visible in this view; presumably non-matching entries are
// skipped and the original uri is returned when nothing matches - confirm.
785 static Uri
RemapUri (Uri uri
)
787 // FIXME: This is ghetto
788 foreach (DictionaryEntry dict
in remap_table
) {
// The key is matched anywhere in the path, not only as a leading prefix.
789 if (uri
.LocalPath
.IndexOf ((string) dict
.Key
) == -1)
791 return new Uri (uri
.LocalPath
.Replace ((string) dict
.Key
, (string) dict
.Value
));
// Decides whether a directory should be skipped; the visible check tests
// whether its name starts with ".".
// NOTE(review): the return statements are not visible in this view - confirm
// against the complete file.
796 static bool Ignore (DirectoryInfo directory
)
798 if (directory
.Name
.StartsWith ("."))
// Decides whether a file should be skipped. The visible checks, in order:
// name starts with ".", FileSystem.IsSpecialFile, the allow patterns (only
// consulted when at least one exists), then the deny patterns. Patterns are
// matched against the file name, not the full path.
// NOTE(review): the return statements for each check are not visible in this
// view - confirm the outcome of each branch against the complete file.
804 static bool Ignore (FileInfo file
)
806 if (file
.Name
.StartsWith ("."))
809 if (FileSystem
.IsSpecialFile (file
.FullName
))
812 if (allowed_patterns
.Count
> 0) {
813 foreach (ExcludeItem pattern
in allowed_patterns
)
814 if (pattern
.IsMatch (file
.Name
))
820 foreach (ExcludeItem pattern
in denied_patterns
)
821 if (pattern
.IsMatch (file
.Name
))
824 // FIXME: Add more stuff here