Oops, fix a broken part of the patch
[beagle.git] / beagled / BuildIndex.cs
blobb5eb602932a5d08b42d6df39f29a26bba8d502ac
1 //
2 // BuildIndex.cs
3 //
4 // Copyright (C) 2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in all
16 // copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 // SOFTWARE.
27 using System;
28 using System.Collections;
29 using System.Diagnostics;
30 using System.IO;
31 using System.Net;
32 using System.Threading;
34 using System.Xml;
35 using System.Xml.Serialization;
37 using Beagle;
38 using Beagle.Util;
39 using FSQ = Beagle.Daemon.FileSystemQueryable.FileSystemQueryable;
41 namespace Beagle.Daemon
43 class BuildIndex
// Raw command-line arguments, stored by DoMain after parsing.
static string [] argv;

// Option flags set while parsing the command line.
static bool arg_recursive = false, arg_debug = false, arg_cache_text = false, arg_disable_filtering = false, arg_disable_restart = false;

// Maps "--remap from:to" source fragments to their replacements.
static Hashtable remap_table = new Hashtable ();

static string arg_output, arg_tag, arg_source;

/////////////////////////////////////////////////////////

// Files and directories that are allowed to be in the target
// directory before we blow it away.  If we encounter any file
// or dir not in this list, we'll bail out.
static string [] allowed_files = {
	"FileAttributesStore.db",
	"fingerprint",
	"version"
};

static string [] allowed_dirs = {
	"Locks",
	"PrimaryIndex",
	"SecondaryIndex",
	"TextCache"
};

/////////////////////////////////////////////////////////

static FileAttributesStore_Sqlite backing_fa_store;
static FileAttributesStore fa_store;

static LuceneIndexingDriver driver;

// These flags are written by one thread (crawler, indexer, memory
// monitor or the signal handler) and polled in loops by the others,
// so they are volatile to keep every read going to memory instead of
// being cached or hoisted out of the polling loops.
static volatile bool crawling = true, indexing = true, shutdown = false, restart = false;

static ArrayList allowed_patterns = new ArrayList ();
static ArrayList denied_patterns = new ArrayList ();

// pending_files is filled by the crawl thread while the index thread
// drains it.  A plain Queue is not safe for that, so both queues use
// the synchronized wrapper (each individual call is atomic).
static Queue pending_files = Queue.Synchronized (new Queue ());
static Queue pending_directories = Queue.Synchronized (new Queue ());

// Number of indexables collected before the indexer is flushed.
const int BATCH_SIZE = 30;
88 /////////////////////////////////////////////////////////
// Process entry point.  Delegates all work to DoMain and turns any
// exception that escapes it into a logged error and exit status 1.
static void Main (string [] args)
{
	try {
		DoMain (args);
	} catch (Exception unhandled) {
		const string notice = "Unhandled exception thrown. Exiting immediately.";

		Logger.Log.Error (notice);
		Logger.Log.Error (unhandled);
		Environment.Exit (1);
	}
}
// Appends one ExcludeItem per comma-separated pattern in spec to list.
// A spec without a comma yields exactly one pattern.
static void AddPatterns (ArrayList list, string spec)
{
	foreach (string pattern in spec.Split (','))
		list.Add (new ExcludeItem (ExcludeType.Pattern, pattern));
}

// Removes trailing directory separators so that "/a/b" and "/a/b/"
// compare equal.  A path made only of separators (the root) is
// returned untouched.
static string TrimTrailingSeparators (string path)
{
	string trimmed = path.TrimEnd (Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);
	return trimmed.Length > 0 ? trimmed : path;
}

// Logs an error and exits with status 1 if the target directory is
// also one of the queued source paths.
static void ExitIfTargetIsSource (Queue sources)
{
	string target = TrimTrailingSeparators (Path.GetFullPath (arg_output));

	foreach (FileSystemInfo info in sources) {
		if (target == TrimTrailingSeparators (info.FullName)) {
			Logger.Log.Error ("Target directory cannot be one of the source paths.");
			Environment.Exit (1);
		}
	}
}

// Wraps a command-line argument in double quotes (escaping embedded
// double quotes) when it is empty or contains whitespace or quotes,
// so that it survives being re-split by the child process launcher.
// NOTE(review): assumes the runtime splits StartInfo.Arguments using
// shell-like double-quote rules -- confirm for the targeted runtime.
static string QuoteArgument (string arg)
{
	if (arg.Length > 0 && arg.IndexOfAny (new char [] { ' ', '\t', '"', '\'' }) == -1)
		return arg;

	return "\"" + arg.Replace ("\"", "\\\"") + "\"";
}

// Parses the command line, validates the target directory, then runs
// the crawl, index and (optionally) memory-monitor threads until they
// finish.  If the memory monitor requested a restart, the same command
// line is launched again under "mono" before returning.
//
// Exits the process (status 1) on invalid arguments or an unsafe
// target directory; PrintUsage exits with status 0.
static void DoMain (string [] args)
{
	SystemInformation.SetProcessName ("beagle-build-index");

	if (args.Length < 2)
		PrintUsage ();

	int i = 0;
	while (i < args.Length) {

		string arg = args [i];
		++i;
		// Value of an option that takes one; null if arg is the last token.
		string next_arg = i < args.Length ? args [i] : null;

		switch (arg) {
		case "-h":
		case "--help":
			PrintUsage ();
			break;

		case "--tag":
			if (next_arg != null)
				arg_tag = next_arg;
			++i;
			break;

		case "-r":
		case "--recursive":
			arg_recursive = true;
			break;

		case "--enable-text-cache":
			arg_cache_text = true;
			break;

		case "--debug":
			// Documented in the usage text; without this case the
			// flag would fall through to the path handling below.
			arg_debug = true;
			break;

		case "--remap":
			if (next_arg == null)
				break;

			int j = next_arg.IndexOf (':');

			if (j == -1) {
				Logger.Log.Error ("Invalid remap argument: {0}", next_arg);
				Environment.Exit (1);
			}

			remap_table [next_arg.Substring (0, j)] = next_arg.Substring (j+1);

			++i;
			break;

		case "--target":
			if (next_arg != null)
				arg_output = Path.IsPathRooted (next_arg) ? next_arg : Path.GetFullPath (next_arg);
			++i;
			break;

		case "--disable-filtering":
			arg_disable_filtering = true;
			break;

		case "--allow-pattern":
			if (next_arg == null)
				break;

			AddPatterns (allowed_patterns, next_arg);

			++i;
			break;

		case "--deny-pattern":
			if (next_arg == null)
				break;

			AddPatterns (denied_patterns, next_arg);

			++i;
			break;

		case "--disable-restart":
			arg_disable_restart = true;
			break;

		case "--source":
			if (next_arg == null)
				break;

			arg_source = next_arg;
			++i;
			break;

		default:
			// Anything else is a source path; paths that do not
			// exist are silently skipped.
			string path = Path.IsPathRooted (arg) ? arg : Path.GetFullPath (arg);

			if (Directory.Exists (path))
				pending_directories.Enqueue (new DirectoryInfo (path));
			else if (File.Exists (path))
				pending_files.Enqueue (new FileInfo (path));
			break;
		}
	}

	argv = args;

	/////////////////////////////////////////////////////////

	if (arg_output == null) {
		Logger.Log.Error ("--target must be specified");
		Environment.Exit (1);
	}

	// Compare with trailing separators stripped, so that a source
	// given as "dir/" is still recognised as the target "dir".
	ExitIfTargetIsSource (pending_directories);
	ExitIfTargetIsSource (pending_files);

	if (!Directory.Exists (Path.GetDirectoryName (arg_output))) {
		Logger.Log.Error ("Index directory not available for construction: {0}", arg_output);
		Environment.Exit (1);
	}

	// Be *EXTRA PARANOID* about the contents of the target
	// directory, because creating an indexing driver will
	// nuke it.
	if (Directory.Exists (arg_output)) {

		foreach (FileInfo info in DirectoryWalker.GetFileInfos (arg_output)) {
			if (Array.IndexOf (allowed_files, info.Name) == -1) {
				Logger.Log.Error ("{0} doesn't look safe to delete: non-Beagle file {1} was found", arg_output, info.FullName);
				Environment.Exit (1);
			}
		}

		foreach (DirectoryInfo info in DirectoryWalker.GetDirectoryInfos (arg_output)) {
			if (Array.IndexOf (allowed_dirs, info.Name) == -1) {
				Logger.Log.Error ("{0} doesn't look safe to delete: non-Beagle directory {1} was found", arg_output, info.FullName);
				Environment.Exit (1);
			}
		}
	}

	// Set the IO priority to idle so we don't slow down the system
	IoPriority.SetIdle ();

	driver = new LuceneIndexingDriver (arg_output, false);
	driver.TextCache = (arg_cache_text) ? new TextCache (arg_output) : null;
	if (driver.TextCache != null)
		driver.TextCache.WorldReadable = true;

	backing_fa_store = new FileAttributesStore_Sqlite (driver.TopDirectory, driver.Fingerprint);
	fa_store = new FileAttributesStore (backing_fa_store);

	// Set up signal handlers
	SetupSignalHandlers ();

	Thread crawl_thread, index_thread, monitor_thread = null;

	// Start the thread that does the crawling
	crawl_thread = ExceptionHandlingThread.Start (new ThreadStart (CrawlWorker));

	// Start the thread that does the actual indexing
	index_thread = ExceptionHandlingThread.Start (new ThreadStart (IndexWorker));

	if (!arg_disable_restart) {
		// Start the thread that monitors memory usage.
		monitor_thread = ExceptionHandlingThread.Start (new ThreadStart (MemoryMonitorWorker));
	}

	// Join all the threads so that we know that we're the only thread still running
	crawl_thread.Join ();
	index_thread.Join ();
	if (monitor_thread != null)
		monitor_thread.Join ();

	if (restart) {
		Logger.Log.Debug ("Restarting helper");

		// Quote every argument individually: a plain join with
		// spaces would split paths or patterns containing whitespace.
		string [] cmdline = Environment.GetCommandLineArgs ();
		string [] quoted = new string [cmdline.Length];
		for (int n = 0; n < cmdline.Length; ++n)
			quoted [n] = QuoteArgument (cmdline [n]);

		Process p = new Process ();
		p.StartInfo.UseShellExecute = false;
		// FIXME: Maybe this isn't the right way to do things?  It should be ok,
		// the PATH is inherited from the shell script which runs mono itself.
		p.StartInfo.FileName = "mono";
		p.StartInfo.Arguments = String.Join (" ", quoted);
		p.Start ();
	}
}
308 /////////////////////////////////////////////////////////////////
// Thread body: drains pending_directories, queueing every non-ignored,
// non-symlink file into pending_files (and, with --recursive, every
// such subdirectory back into pending_directories).  Stops early when
// shutdown is set.  Always clears the crawling flag on exit so that
// the other workers can tell that no more files will arrive.
static void CrawlWorker ()
{
	Logger.Log.Debug ("Starting CrawlWorker");

	try {
		int count_dirs = 0;
		// Counted here rather than read from pending_files.Count,
		// because the index thread drains that queue concurrently.
		int count_files = 0;

		while (pending_directories.Count > 0) {
			DirectoryInfo dir = (DirectoryInfo) pending_directories.Dequeue ();

			try {
				if (arg_recursive) {
					foreach (DirectoryInfo subdir in DirectoryWalker.GetDirectoryInfos (dir)) {
						if (!Ignore (subdir)
						    && !FileSystem.IsSymLink (subdir.FullName))
							pending_directories.Enqueue (subdir);
					}
				}

				foreach (FileInfo file in DirectoryWalker.GetFileInfos (dir)) {
					if (!Ignore (file)
					    && !FileSystem.IsSymLink (file.FullName)) {
						pending_files.Enqueue (file);
						++count_files;
					}
				}

			} catch (DirectoryNotFoundException e) {
				// The directory went away while we were crawling;
				// skip it, but leave a trace in the log.
				Logger.Log.Debug ("Skipping missing directory {0}: {1}", dir.FullName, e.Message);
			}

			if (shutdown)
				break;

			count_dirs++;
		}

		Logger.Log.Debug ("Scanned {0} files in {1} directories", count_files, count_dirs);
	} finally {
		Logger.Log.Debug ("CrawlWorker Done");

		crawling = false;
	}
}
348 /////////////////////////////////////////////////////////////////
// Applies the command-line options (filtering, tag, source name) to
// the indexable and then appends it to the request.
static void AddToRequest (IndexerRequest request, Indexable indexable)
{
	// With --disable-filtering only the file attributes get indexed.
	if (arg_disable_filtering) {
		indexable.Filtering = IndexableFiltering.Never;
	}

	// The tag makes the item easy to identify later (for say, removal).
	if (arg_tag != null) {
		indexable.AddProperty (Property.NewUnsearched("Tag", arg_tag));
	}

	// No --source given: fall back to the target directory's name and
	// remember it for all later calls.
	if (arg_source == null) {
		string sanitized_target = StringFu.SanitizePath (arg_output);
		arg_source = new DirectoryInfo (sanitized_target).Name;
	}

	indexable.Source = arg_source;
	request.Add (indexable);
}
// Flushes the request through the indexer and processes the receipts:
// for each added top-level item the file attributes are written to
// fa_store, and child indexables reported by the indexer are gathered.
// On return the request holds only those children (possibly none).
// Returns the receipts exactly as produced by the indexer.
static IndexerReceipt [] FlushIndexer (IIndexer indexer, IndexerRequest request)
{
	IndexerReceipt [] receipts = indexer.Flush (request);
	ArrayList children = new ArrayList ();

	foreach (IndexerReceipt receipt in receipts) {

		if (receipt is IndexerChildIndexablesReceipt) {
			// Children go back into the indexer on the next flush.
			children.AddRange (((IndexerChildIndexablesReceipt) receipt).Children);
			continue;
		}

		if (!(receipt is IndexerAddedReceipt))
			continue;

		IndexerAddedReceipt added = (IndexerAddedReceipt) receipt;
		Indexable source_item = request.GetByUri (added.Uri);

		// File attributes are only written for top-level items,
		// never for children.
		if (source_item.ParentUri != null)
			continue;

		FileAttributes attr = fa_store.ReadOrCreate (added.Uri.LocalPath);
		attr.LastWriteTime = source_item.Timestamp;
		attr.FilterName = added.FilterName;
		attr.FilterVersion = added.FilterVersion;
		fa_store.Write (attr);
	}

	// Drop the old contents, then queue up the children.
	request.Clear ();
	foreach (Indexable child in children)
		AddToRequest (request, child);

	return receipts;
}
// Thread body: turns queued files into indexables and flushes them to
// the driver in batches of BATCH_SIZE.  Runs until shutdown is set, or
// until the queue is empty and the crawler has finished.  Afterwards
// the remaining request is flushed, the attribute store is flushed and
// the index is optimized.  Always clears the indexing flag on exit.
static void IndexWorker ()
{
	Logger.Log.Debug ("Starting IndexWorker");

	try {
		IndexerRequest batch = new IndexerRequest ();

		for (;;) {
			if (shutdown)
				break;

			if (pending_files.Count == 0) {
				// Nothing queued: we are done once the crawler is,
				// otherwise wait a little for it to find more.
				if (!crawling)
					break;

				Thread.Sleep (50);
				continue;
			}

			FileInfo candidate = (FileInfo) pending_files.Dequeue ();
			Uri uri = UriFu.PathToFileUri (candidate.FullName);

			// Check that we really should be indexing the file
			bool wanted = candidate.Exists
				&& !Ignore (candidate)
				&& !fa_store.IsUpToDate (candidate.FullName);
			if (!wanted)
				continue;

			// Create the indexable and add the standard properties we
			// use in the FileSystemQueryable.
			Indexable item = new Indexable (uri);
			item.Timestamp = candidate.LastWriteTimeUtc;
			FSQ.AddStandardPropertiesToIndexable (item, candidate.Name, Guid.Empty, false);

			AddToRequest (batch, item);

			if (batch.Count < BATCH_SIZE)
				continue;

			// FlushIndexer clears the batch (children may be re-added).
			Logger.Log.Debug ("Flushing driver, {0} items in queue", batch.Count);
			FlushIndexer (driver, batch);
		}

		// Keep flushing until the request is empty: every flush can put
		// child indexables back into it.
		while (batch.Count > 0) {
			FlushIndexer (driver, batch);
		}

		backing_fa_store.Flush ();
		driver.OptimizeNow ();
	} finally {
		Logger.Log.Debug ("IndexWorker Done");

		indexing = false;
	}
}
471 /////////////////////////////////////////////////////////////////
// Thread body: polls SystemInformation.VmRss every three seconds while
// crawling or indexing is in progress.  When the value exceeds
// `threshold` times the value read at start-up, it sets both restart
// and shutdown and returns.
static void MemoryMonitorWorker ()
{
	const double threshold = 5.0;

	int baseline = SystemInformation.VmRss;
	int previous = 0;

	for (;;) {
		if (shutdown || (!crawling && !indexing))
			return;

		// Check resident memory usage
		int current = SystemInformation.VmRss;
		double ratio = current / (double) baseline;

		// Only log when the value actually changed.
		if (current != previous) {
			double percent = 100.0 * (ratio - 1) / (threshold - 1);
			Logger.Log.Debug ("Size: VmRSS={0:0.0} MB, size={1:0.00}, {2:0.0}%",
					  current/1024.0, ratio, percent);
		}
		previous = current;

		if (ratio > threshold) {
			Logger.Log.Debug ("Process too big, shutting down!");
			restart = true;
			shutdown = true;
			return;
		}

		Thread.Sleep (3000);
	}
}
500 /////////////////////////////////////////////////////////////////
502 // From BeagleDaemon.cs
// Installs OurSignalHandler for SIGINT and SIGTERM, and also for
// SIGQUIT unless the BEAGLE_THERE_BE_NO_QUITTIN environment variable
// is set.
static void SetupSignalHandlers ()
{
	// Call the handler once with a dummy value so that it is JITed
	// before it is registered.
	OurSignalHandler (-1);

	// Set up our signal handler
	Mono.Unix.Native.Stdlib.signal (Mono.Unix.Native.Signum.SIGINT, OurSignalHandler);
	Mono.Unix.Native.Stdlib.signal (Mono.Unix.Native.Signum.SIGTERM, OurSignalHandler);

	bool quit_allowed = Environment.GetEnvironmentVariable("BEAGLE_THERE_BE_NO_QUITTIN") == null;
	if (quit_allowed) {
		Mono.Unix.Native.Stdlib.signal (Mono.Unix.Native.Signum.SIGQUIT, OurSignalHandler);
	}
}
// Signal callback: logs the request and raises the shutdown flag.
// A negative signal number is a no-op; SetupSignalHandlers uses that
// to call the handler once so that it is pre-JITed.
static void OurSignalHandler (int signal)
{
	if (signal >= 0) {
		Logger.Log.Debug ("Shutdown Requested");
		shutdown = true;
	}
}
527 /////////////////////////////////////////////////////////////////
// Prints the command-line help to standard output and terminates the
// process with exit status 0; this method does not return.
static void PrintUsage ()
{
	string usage =
		"beagle-build-index: Build an index.\n" +
		"Web page: http://www.gnome.org/projects/beagle\n" +
		"Copyright (C) 2005-2006 Novell, Inc.\n\n";

	usage +=
		"Usage: beagle-build-index [OPTIONS] --target <index_path> <path> [path]\n\n" +

		"** WARNING **\n" +
		"beagle-build-index will *delete all existing data* within the target\n" +
		"directory. Ensure that the target path is set correctly before running.\n\n" +

		"Options:\n" +
		" --source [name]\t\tThe index's source name. Defaults to the target directory name\n" +
		" --remap [path1:path2]\t\tRemap data paths to fit target. \n" +
		" --tag [tag]\t\t\tTag index data for identification.\n" +
		// "-r" is accepted by the argument parser as a short form.
		" --recursive, -r\t\tCrawl source path recursively.\n" +
		" --enable-text-cache\t\tBuild text-cache of documents used for snippets.\n" +
		" --disable-filtering\t\tDisable all filtering of files. Only index attributes.\n" +
		" --allow-pattern [pattern]\tOnly allow files that match the pattern to be indexed.\n" +
		" --deny-pattern [pattern]\tKeep any files that match the pattern from being indexed.\n" +
		" --disable-restart\t\tDon't restart when memory usage gets above a certain threshold.\n" +
		" --help, -h\t\t\tPrint this usage message.\n" +
		" --debug\t\t\tEcho verbose debugging information.\n\n";

	Console.WriteLine (usage);
	Environment.Exit (0);
}
560 /////////////////////////////////////////////////////////
// Rewrites a URI according to remap_table: the first entry whose key
// occurs anywhere in the URI's local path wins, and every occurrence of
// that key in the path is replaced by the entry's value.  URIs that
// match no entry are returned unchanged.
static Uri RemapUri (Uri uri)
{
	// FIXME: This is ghetto
	foreach (DictionaryEntry entry in remap_table) {
		string fragment = (string) entry.Key;

		if (uri.LocalPath.IndexOf (fragment) != -1) {
			string replacement = (string) entry.Value;
			return new Uri (uri.LocalPath.Replace (fragment, replacement));
		}
	}

	return uri;
}
// Returns true for directories that should not be crawled: those whose
// name starts with a dot.
static bool Ignore (DirectoryInfo directory)
{
	bool hidden = directory.Name.StartsWith (".");
	return hidden;
}
// Returns true for files that should not be indexed: dot-files,
// symlinks, files matching none of the --allow-pattern patterns (when
// any were given), and otherwise files matching a --deny-pattern.
// Patterns are matched against the file name only.
static bool Ignore (FileInfo file)
{
	string name = file.Name;

	if (name.StartsWith (".") || FileSystem.IsSymLink (file.FullName))
		return true;

	// An allow list, once present, is exclusive: the deny list is
	// not consulted at all.
	if (allowed_patterns.Count > 0) {
		bool allowed = false;
		foreach (ExcludeItem pattern in allowed_patterns) {
			if (pattern.IsMatch (name)) {
				allowed = true;
				break;
			}
		}
		return !allowed;
	}

	foreach (ExcludeItem pattern in denied_patterns) {
		if (pattern.IsMatch (name))
			return true;
	}

	// FIXME: Add more stuff here

	return false;
}