//
// BuildIndex.cs
//
// Copyright (C) 2005 Novell, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//

using System;
using System.IO;
using System.Net;
using System.Threading;
using System.Collections;

using System.Xml;
using System.Xml.Serialization;

using Beagle;
using Beagle.Util;

using FSQ = Beagle.Daemon.FileSystemQueryable.FileSystemQueryable;

namespace Beagle.Daemon {

	class BuildIndex {

		static bool arg_recursive = false, arg_debug = false, arg_cache_text = false, arg_disable_filtering = false;

		static Hashtable remap_table = new Hashtable ();

		static string arg_output, arg_tag;

		/////////////////////////////////////////////////////////

		static FileAttributesStore_Sqlite backing_fa_store;
		static FileAttributesStore fa_store;

		static LuceneIndexingDriver driver;

		static bool crawling = true, shutdown = false;

		static ArrayList allowed_patterns = new ArrayList ();
		static ArrayList denied_patterns = new ArrayList ();

		static Queue pending_files = new Queue ();
		static Queue pending_directories = new Queue ();

		const int BATCH_SIZE = 30;

		/////////////////////////////////////////////////////////

		static void Main (string [] args)
		{
			if (args.Length < 2)
				PrintUsage ();

			int i = 0;
			while (i < args.Length) {

				string arg = args [i];
				++i;
				string next_arg = i < args.Length ? args [i] : null;

				switch (arg) {
				case "-h":
				case "--help":
					PrintUsage ();
					break;

				case "--tag":
					if (next_arg != null)
						arg_tag = next_arg;
					++i;
					break;

				case "-r":
				case "--recursive":
					arg_recursive = true;
					break;

				case "--enable-text-cache":
					arg_cache_text = true;
					break;

				case "--remap":
					if (next_arg == null)
						break;

					int j = next_arg.IndexOf (":");

					if (j == -1) {
						Console.WriteLine ("Invalid remap argument: {0}", next_arg);
						Environment.Exit (1);
					}

					remap_table [next_arg.Substring (0, j)] = next_arg.Substring (j+1);

					++i;
					break;

				case "--target":
					if (next_arg != null)
						arg_output = Path.IsPathRooted (next_arg) ? next_arg : Path.GetFullPath (next_arg);
					++i;
					break;

				case "--disable-filtering":
					arg_disable_filtering = true;
					break;

				case "--allow-pattern":
					if (next_arg == null)
						break;

					if (next_arg.IndexOf (',') != -1) {
						foreach (string pattern in next_arg.Split (','))
							allowed_patterns.Add (new ExcludeItem (ExcludeType.Pattern, pattern));
					} else {
						allowed_patterns.Add (new ExcludeItem (ExcludeType.Pattern, next_arg));
					}

					++i;
					break;

				case "--deny-pattern":
					if (next_arg == null)
						break;

					if (next_arg.IndexOf (',') != -1) {
						foreach (string pattern in next_arg.Split (','))
							denied_patterns.Add (new ExcludeItem (ExcludeType.Pattern, pattern));
					} else {
						denied_patterns.Add (new ExcludeItem (ExcludeType.Pattern, next_arg));
					}

					++i;
					break;

				default:
					string path = Path.IsPathRooted (arg) ? arg : Path.GetFullPath (arg);

					if (Directory.Exists (path))
						pending_directories.Enqueue (new DirectoryInfo (path));
					else if (File.Exists (path))
						pending_files.Enqueue (new FileInfo (path));
					break;
				}
			}

			/////////////////////////////////////////////////////////

			// Bail out early if no --target was given; Path.GetDirectoryName (null) would throw.
			if (arg_output == null) {
				Console.WriteLine ("Please specify a target index directory with --target");
				Environment.Exit (1);
			}

			if (!Directory.Exists (Path.GetDirectoryName (arg_output))) {
				Console.WriteLine ("Index directory not available for construction: {0}", arg_output);
				Environment.Exit (1);
			}

			driver = new LuceneIndexingDriver (arg_output);
			driver.TextCache = (arg_cache_text) ? new TextCache (arg_output) : null;

			backing_fa_store = new FileAttributesStore_Sqlite (driver.TopDirectory, driver.Fingerprint);
			fa_store = new FileAttributesStore (backing_fa_store);

			// Set up signal handlers
			SetupSignalHandlers ();

			// Start the thread that does the crawling
			ExceptionHandlingThread.Start (new ThreadStart (CrawlWorker));

			// Start the thread that does the actual indexing
			ExceptionHandlingThread.Start (new ThreadStart (IndexWorker));
		}

		/////////////////////////////////////////////////////////////////

		static void CrawlWorker ()
		{
			Logger.Log.Debug ("Starting CrawlWorker");

			int count_dirs = 0;

			while (pending_directories.Count > 0) {
				DirectoryInfo dir = (DirectoryInfo) pending_directories.Dequeue ();

				try {
					if (arg_recursive)
						foreach (DirectoryInfo subdir in DirectoryWalker.GetDirectoryInfos (dir))
							if (!Ignore (subdir))
								pending_directories.Enqueue (subdir);

					foreach (FileInfo file in DirectoryWalker.GetFileInfos (dir))
						if (!Ignore (file))
							pending_files.Enqueue (file);

				} catch (DirectoryNotFoundException) {
					// The directory vanished while we were crawling it; skip it.
				}

				if (shutdown)
					break;

				count_dirs++;
			}

			Logger.Log.Debug ("Scanned {0} files in {1} directories", pending_files.Count, count_dirs);
			Logger.Log.Debug ("CrawlWorker Done");

			crawling = false;
		}

		/////////////////////////////////////////////////////////////////

		static IndexerReceipt [] FlushIndexer (IIndexer indexer)
		{
			IndexerReceipt [] receipts;
			receipts = indexer.FlushAndBlock ();

			foreach (IndexerReceipt raw_r in receipts) {

				if (raw_r is IndexerAddedReceipt) {
					// Update the file attributes
					IndexerAddedReceipt r = (IndexerAddedReceipt) raw_r;

					string path = r.Uri.LocalPath;

					FileAttributes attr;
					attr = fa_store.ReadOrCreate (path);

					attr.LastWriteTime = FileSystem.GetLastWriteTime (path);
					attr.FilterName = r.FilterName;
					attr.FilterVersion = r.FilterVersion;

					fa_store.Write (attr);

				} else if (raw_r is IndexerChildIndexablesReceipt) {
					// Add any child indexables back into our indexer
					IndexerChildIndexablesReceipt r = (IndexerChildIndexablesReceipt) raw_r;
					foreach (Indexable i in r.Children)
						indexer.Add (i);
				}
			}

			return receipts;
		}

		static void IndexWorker ()
		{
			Logger.Log.Debug ("Starting IndexWorker");

			Indexable indexable;
			int pending_adds = 0;

			while (!shutdown) {
				if (pending_files.Count > 0) {
					FileInfo file = (FileInfo) pending_files.Dequeue ();
					Uri uri = UriFu.PathToFileUri (file.FullName);

					// Check that we really should be indexing the file
					if (!file.Exists || Ignore (file) || fa_store.IsUpToDate (file.FullName))
						continue;

					// Create the indexable and add the standard properties we
					// use in the FileSystemQueryable.
					indexable = new Indexable (uri);
					FSQ.AddStandardPropertiesToIndexable (indexable, file.Name, Guid.Empty, false);

					// Disable filtering and only index file attributes
					if (arg_disable_filtering)
						indexable.Filtering = IndexableFiltering.Never;

					// Tag the item for easy identification (for say, removal)
					if (arg_tag != null)
						indexable.AddProperty (Property.NewKeyword ("Tag", arg_tag));

					driver.Add (indexable);
					++pending_adds;

					if (pending_adds % BATCH_SIZE == 0) {
						Logger.Log.Debug ("Flushing driver, {0} items in queue", pending_files.Count);
						FlushIndexer (driver);
						pending_adds = 0;
					}

				} else if (crawling) {
					//Logger.Log.Debug ("IndexWorker: La la la...");
					Thread.Sleep (50);
				} else {
					break;
				}
			}

			// Call Flush one last time.
			// This should be a totally safe no-op if there are no pending operations.
			// FIXME: This is incorrect.  We will drop any children in the final flush.
			FlushIndexer (driver);
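
			// A possible fix for the FIXME above (sketch only, not enabled here; it
			// assumes FlushIndexer always returns a non-null receipt array): keep
			// flushing until no more child indexables are produced, e.g.
			//
			//   bool flushed_children;
			//   do {
			//           flushed_children = false;
			//           foreach (IndexerReceipt r in FlushIndexer (driver))
			//                   if (r is IndexerChildIndexablesReceipt)
			//                           flushed_children = true;
			//   } while (flushed_children);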

			backing_fa_store.Flush ();

			Logger.Log.Debug ("IndexWorker Done");
		}

		/////////////////////////////////////////////////////////////////

		// From BeagleDaemon.cs

		// The integer values of the Mono.Posix.Signal enumeration don't actually
		// match the Linux signal numbers.  Oops!
		// This is fixed in Mono.Unix, but for the moment we want to maintain
		// compatibility with mono 1.0.x.
		const int ACTUAL_LINUX_SIGINT  = 2;
		const int ACTUAL_LINUX_SIGQUIT = 3;
		const int ACTUAL_LINUX_SIGTERM = 15;

		static void SetupSignalHandlers ()
		{
			// Force OurSignalHandler to be JITed
			OurSignalHandler (-1);

			// Set up our signal handler
			Mono.Posix.Syscall.sighandler_t sig_handler;
			sig_handler = new Mono.Posix.Syscall.sighandler_t (OurSignalHandler);
			Mono.Posix.Syscall.signal (ACTUAL_LINUX_SIGINT, sig_handler);
			Mono.Posix.Syscall.signal (ACTUAL_LINUX_SIGQUIT, sig_handler);
			Mono.Posix.Syscall.signal (ACTUAL_LINUX_SIGTERM, sig_handler);
		}

		static void OurSignalHandler (int signal)
		{
			// This allows us to call OurSignalHandler w/o doing anything.
			// We want to call it once to ensure that it is pre-JITed.
			if (signal < 0)
				return;

			Logger.Log.Debug ("Shutdown Requested");
			shutdown = true;
		}

		/////////////////////////////////////////////////////////////////

		static void PrintUsage ()
		{
			string usage =
				"beagle-build-index: Build an index.\n" +
				"Web page: http://www.gnome.org/projects/beagle\n" +
				"Copyright (C) 2005 Novell, Inc.\n\n";

			usage +=
				"Usage: beagle-build-index [OPTIONS] --target <index_path> <path> [path]\n\n" +
				"Options:\n" +
				"  --remap [path1:path2]\t\tRemap data paths to fit target.\n" +
				"  --tag [tag]\t\t\tTag index data for identification.\n" +
				"  --recursive\t\t\tCrawl source path recursively.\n" +
				"  --enable-text-cache\t\tBuild text-cache of documents used for snippets.\n" +
				"  --disable-filtering\t\tDisable all filtering of files. Only index attributes.\n" +
				"  --allow-pattern [pattern]\tOnly allow files that match the pattern to be indexed.\n" +
				"  --deny-pattern [pattern]\tKeep any files that match the pattern from being indexed.\n" +
				"  --debug\t\t\tEcho verbose debugging information.\n";

			Console.WriteLine (usage);
			Environment.Exit (0);
		}
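
		// Example invocation (illustrative only; the paths and tag are made up):
		//
		//   beagle-build-index --recursive --tag documentation --deny-pattern '*.tmp' \
		//           --target /var/cache/beagle/doc-index /usr/share/doc
		//
		// This crawls /usr/share/doc recursively, skips files matching *.tmp, and
		// writes a Lucene index tagged "documentation" into /var/cache/beagle/doc-index.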

		/////////////////////////////////////////////////////////

		static Uri RemapUri (Uri uri)
		{
			// FIXME: This is ghetto
			foreach (DictionaryEntry dict in remap_table) {
				if (uri.LocalPath.IndexOf ((string) dict.Key) == -1)
					continue;
				return new Uri (uri.LocalPath.Replace ((string) dict.Key, (string) dict.Value));
			}
			return uri;
		}
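
		// Illustrative example (hypothetical paths): given "--remap /mnt/staging:/usr/share/doc",
		// RemapUri turns file:///mnt/staging/manual.html into file:///usr/share/doc/manual.html,
		// i.e. it performs a plain substring replacement on the local path of the URI.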

		static bool Ignore (DirectoryInfo directory)
		{
			// Skip hidden directories
			if (directory.Name.StartsWith ("."))
				return true;

			return false;
		}

		static bool Ignore (FileInfo file)
		{
			// Skip hidden files and symlinks
			if (file.Name.StartsWith ("."))
				return true;

			if (FileSystem.IsSymLink (file.FullName))
				return true;

			// If allow patterns were given, index only files matching one of them
			if (allowed_patterns.Count > 0) {
				foreach (ExcludeItem pattern in allowed_patterns)
					if (pattern.IsMatch (file.Name))
						return false;

				return true;
			}

			foreach (ExcludeItem pattern in denied_patterns)
				if (pattern.IsMatch (file.Name))
					return true;

			// FIXME: Add more stuff here

			return false;
		}
	}
}