* Work toward getting static backends going again, but they're still broken.
[beagle.git] / beagled / LuceneQueryable.cs
blob1f95b2eb37fc52be87af8bf7d077b950b7d78ef4
1 //
2 // LuceneQueryable.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
27 using System;
28 using System.Collections;
29 using System.IO;
31 using Beagle.Util;
33 namespace Beagle.Daemon {
35 public abstract class LuceneQueryable : BackendBase {
37 static public bool OptimizeRightAway = false;
// Factory signature used to obtain the IIndexer for a given source
// name and source version.
public delegate IIndexer IndexerCreator (string source_name, int source_version);

// The installed indexer factory; stays null until IndexerHook is set.
private static IndexerCreator indexer_hook = null;

// Write-only hook for installing the indexer factory.  The
// LuceneQueryable constructor consults it for every queryable that
// is not in read-only mode.
public static IndexerCreator IndexerHook {
	set {
		indexer_hook = value;
	}
}
47 //////////////////////////////////////////////////////////
// Signature of the handlers that OptimizeAll runs.
public delegate void OptimizeAllHandler ();

// Handlers registered by the LuceneQueryable constructor, one per
// queryable that has an indexer.
private static OptimizeAllHandler OptimizeAllEvent;

// Runs every registered optimize handler.  Does nothing when no
// handler has been registered.
public static void OptimizeAll ()
{
	OptimizeAllHandler handlers = OptimizeAllEvent;
	if (handlers == null)
		return;

	handlers ();
}
59 //////////////////////////////////////////////////////////
61 private Scheduler scheduler = Scheduler.Global;
62 private FileAttributesStore fa_store = null;
64 private string source_name;
66 private LuceneQueryingDriver driver;
67 private IIndexer indexer = null;
69 private Scheduler.Task our_final_flush_task = null;
70 private Scheduler.Task our_optimize_task = null;
72 private object request_lock = new object ();
73 private IndexerRequest pending_request = new IndexerRequest ();
75 //////////////////////////////////////////////////////////
// Writable queryable with no explicit source version (-1).
public LuceneQueryable (string source_name)
	: this (source_name, -1, false)
{
}

// Queryable with no explicit source version (-1), optionally read-only.
public LuceneQueryable (string source_name, bool read_only_mode)
	: this (source_name, -1, read_only_mode)
{
}

// Writable queryable with an explicit source version.
public LuceneQueryable (string source_name, int source_version)
	: this (source_name, source_version, false)
{
}

// Main constructor: builds the querying driver, registers the hit
// filter and, unless read_only_mode is set, obtains an indexer from
// the IndexerHook factory.  Throws an Exception when no indexer can
// be obtained for a writable queryable.
public LuceneQueryable (string source_name, int source_version, bool read_only_mode)
{
	this.source_name = source_name;

	this.driver = BuildLuceneQueryingDriver (source_name, source_version, read_only_mode);
	this.driver.RegisterHitFilter (source_name, this.HitFilter);

	// A queryable in read-only mode never gets an indexer, so
	// nothing below applies to it.
	if (read_only_mode)
		return;

	IndexerCreator creator = indexer_hook;
	if (creator != null)
		this.indexer = creator (source_name, source_version);

	if (this.indexer == null)
		throw new Exception ("No indexer available for source " + source_name);

	OptimizeAllEvent += OnOptimizeAllEvent;

	// Schedule an optimize up front, just in case.
	ScheduleOptimize ();

	Shutdown.ShutdownEvent += new Shutdown.ShutdownHandler (OnShutdownEvent);
}
// The backend name may only be set to the source name that was
// handed to the constructor; any other value is rejected with an
// Exception.
public override string Name {
	set {
		if (value == source_name) {
			base.Name = value;
			return;
		}

		throw new Exception (String.Format ("Backend name (from BackendFlavor) '{0}' does not match source name (from LuceneQueryable ctor) '{1}'", value, source_name));
	}
}
// Top-level directory reported by the querying driver.
protected string IndexDirectory {
	get {
		return driver.TopDirectory;
	}
}

// Fingerprint reported by the querying driver.
protected string IndexFingerprint {
	get {
		return driver.Fingerprint;
	}
}

// Per-source data directory: <IndexDirectory>/<source name>.
protected string SourceDataDir {
	get {
		string top = IndexDirectory;
		return Path.Combine (top, this.source_name);
	}
}

// The querying driver built by the constructor.
protected LuceneQueryingDriver Driver {
	get {
		return driver;
	}
}

// The scheduler that this queryable adds its tasks to.
public Scheduler ThisScheduler {
	get {
		return scheduler;
	}
}
138 /////////////////////////////////////////
// Nothing to do at start-up for the base Lucene backend.
public override void Start ()
{
}

// Queries against this backend are answered by the querying driver.
public override IQueryable Queryable {
	get {
		return driver;
	}
}
149 /////////////////////////////////////////
// Extension point invoked from OnShutdownEvent.  Empty by default.
protected virtual void ShutdownHook ()
{
}

// Shutdown handler: cleans up the request that is still pending and
// then runs ShutdownHook.  An exception thrown by the hook is logged
// as a warning and otherwise ignored.
private void OnShutdownEvent ()
{
	lock (request_lock) {
		pending_request.Cleanup ();
	}

	try {
		ShutdownHook ();
	} catch (Exception ex) {
		Logger.Log.Warn (ex, "Caught exception in shutdown hook");
	}
}
168 /////////////////////////////////////////
// Hit filter registered with the driver by the constructor.  The
// default implementation accepts every hit.
protected virtual bool HitFilter (Hit hit)
{
	return true;
}

/////////////////////////////////////////

// DEPRECATED: everything is time-based now, so this default
// implementation is a constant multiplier of 1.0.
protected virtual double RelevancyMultiplier (Hit hit)
{
	return 1.0;
}
// Computes an exponentially decaying multiplier for an item dated dt:
// 0.5 raised to the power (age in days / half_life_days).  The result
// is 1.0 for an item dated "now" and 0.5 for an item exactly one
// half-life away.  The age is the absolute distance from
// DateTime.Now, so dates in the future decay like dates in the past.
//
// half_life_days is not validated here.
// NOTE(review): callers presumably always pass a positive value -- confirm.
static protected double HalfLifeMultiplier (DateTime dt, int half_life_days)
{
	// Math.Abs never yields a negative number, so no special case
	// for "negative age" is needed.
	double days = Math.Abs ((DateTime.Now - dt).TotalDays);

	return Math.Pow (0.5, days / (double) half_life_days);
}
// FIXME: A decaying half-life is a little sketchy: data eventually
// decays beyond the epsilon and is dropped from the results
// entirely, which is almost never what we want, particularly in
// searches with only a few results.  With the default half-life of
// six months, though, it takes over 13 years to fully decay outside
// the epsilon on this multiplier alone.
//
// Same as HalfLifeMultiplier (time, 182).
protected static double HalfLifeMultiplier (DateTime time)
{
	// The default relevancy half-life is six months (182 days).
	int default_half_life_days = 182;

	return HalfLifeMultiplier (time, default_half_life_days);
}
// Returns the largest half-life multiplier among the date-valued
// properties of a hit.
//
//   hit                - the hit whose properties are read
//   default_multiplier - returned when none of the listed properties
//                        is set on the hit
//   properties         - names of the properties to try; entries that
//                        are not strings are skipped
//
// Each property value that is present is parsed with
// StringFu.StringToDateTime and scored with the default six-month
// half-life.
static protected double HalfLifeMultiplierFromProperty (Hit hit,
							 double default_multiplier,
							 params object [] properties)
{
	double best_m = -1.0;

	// An explicit null array has no properties to try.
	if (properties != null) {
		foreach (object obj in properties) {
			// "as" yields null for non-string entries; do not
			// look those up with a null key.
			string key = obj as string;
			if (key == null)
				continue;

			string val = hit [key];
			if (val == null)
				continue;

			DateTime dt = StringFu.StringToDateTime (val);

			// Default half-life: 182 days == six months.
			double this_m = HalfLifeMultiplier (dt);
			if (this_m > best_m)
				best_m = this_m;
		}
	}

	// Multipliers are never negative, so a negative value means
	// that no property was found.
	if (best_m < 0)
		best_m = default_multiplier;

	return best_m;
}
228 /////////////////////////////////////////
// Generates a snippet for uri from the user text cache.
//
// Returns null when the text cache has no reader for uri; otherwise
// returns whatever SnippetFu.GetSnippet produces for query_terms.
protected string GetSnippetFromTextCache (string [] query_terms, Uri uri)
{
	// Look up the hit in our text cache.  If it is there, use the
	// cached version to generate a snippet.
	TextReader reader = TextCache.UserCache.GetReader (uri);
	if (reader == null)
		return null;

	try {
		return SnippetFu.GetSnippet (query_terms, reader);
	} finally {
		// Close the reader even when snippet generation throws.
		reader.Close ();
	}
}
// Produces the snippet for a hit from the text cache, keyed by the
// hit's own Uri.  When remapping, override this with
//   return GetSnippetFromTextCache (query_terms, remapping_fn (hit.Uri))
public override string GetSnippet (string [] query_terms, Hit hit)
{
	Uri cache_uri = hit.Uri;
	return GetSnippetFromTextCache (query_terms, cache_uri);
}
253 /////////////////////////////////////////
255 private int progress_percent = -1;
256 private QueryableState state = QueryableState.Idle;
257 private DateTime last_state_change = DateTime.MinValue;
// Builds a status snapshot of this backend: name, state, progress,
// item count and whether it should be considered to be indexing.
public override QueryableStatus GetBackendStatus ()
{
	QueryableStatus status = new QueryableStatus ();

	status.Name = this.Name;
	status.State = state;
	status.ProgressPercent = progress_percent;

	// Without an indexer (read-only mode) the driver is asked for
	// the item count; otherwise the indexer is.
	if (indexer != null)
		status.ItemCount = indexer.GetItemCount ();
	else
		status.ItemCount = driver.GetItemCount ();

	// State changes are frequent, and the indexing process has no
	// real state machine with continuity.  A delayed indexing
	// task, for example, may not run until several seconds after
	// it was scheduled; the backend is then "Idle" although
	// indexing clearly is not finished.  To work around that, the
	// time of the last state change is tracked as well, and an
	// idle backend whose state changed within the last 30 seconds
	// still counts as indexing.
	bool applicable = state != QueryableState.NotApplicable;
	if (applicable) {
		bool busy = state != QueryableState.Idle;
		if (busy || (DateTime.Now - last_state_change).TotalSeconds <= 30)
			status.IsIndexing = true;
	}

	return status;
}
// Current state of the backend.  Every assignment also records the
// time of the change, which GetBackendStatus reads.
public QueryableState State {
	get {
		return this.state;
	}
	set {
		//Logger.Log.Debug ("State {0}: {1} -> {2}", this, this.state, value);

		this.state = value;
		this.last_state_change = DateTime.Now;
	}
}

// Progress value reported through GetBackendStatus; initially -1.
public int ProgressPercent {
	get {
		return this.progress_percent;
	}
	set {
		this.progress_percent = value;
	}
}
308 /////////////////////////////////////////
// Opens the file `name` inside SourceDataDir for reading.  Returns
// null when either the directory or the file does not exist.
public FileStream ReadDataStream (string name)
{
	if (Directory.Exists (SourceDataDir)) {
		string path = Path.Combine (SourceDataDir, name);

		if (File.Exists (path))
			return new FileStream (path, System.IO.FileMode.Open, FileAccess.Read);
	}

	return null;
}
// Reads the first line of the data file `name`.
//
// Returns null when ReadDataStream finds no such file, and also
// (per StreamReader.ReadLine) when the file is empty.
public string ReadDataLine (string name)
{
	FileStream stream = ReadDataStream (name);

	if (stream == null)
		return null;

	// Dispose the stream and the reader even when reading throws,
	// so that the file handle is not leaked.
	using (stream)
	using (StreamReader reader = new StreamReader (stream)) {
		return reader.ReadLine ();
	}
}
// Opens the file `name` inside SourceDataDir for writing, creating
// the directory first if it does not exist.  FileMode.Create is
// used, so an existing file is overwritten.
public FileStream WriteDataStream (string name)
{
	bool have_dir = Directory.Exists (SourceDataDir);
	if (! have_dir)
		Directory.CreateDirectory (SourceDataDir);

	string path = Path.Combine (SourceDataDir, name);
	FileStream stream = new FileStream (path, System.IO.FileMode.Create, FileAccess.Write, FileShare.ReadWrite);

	return stream;
}
// Replaces the contents of the data file `name` with a single line.
//
// Passing a null line deletes the file instead (a no-op when the
// directory or the file does not exist).
public void WriteDataLine (string name, string line)
{
	if (line == null) {
		if (! Directory.Exists (SourceDataDir))
			return;

		string path = Path.Combine (SourceDataDir, name);

		if (File.Exists (path))
			File.Delete (path);

		return;
	}

	// Dispose the stream and the writer even when writing throws,
	// so that the file handle is not leaked.
	using (FileStream stream = WriteDataStream (name))
	using (StreamWriter writer = new StreamWriter (stream)) {
		writer.WriteLine (line);
	}
}
368 //////////////////////////////////////////////////////////////////////////////////
370 // More hooks. These are mostly here for the file system backend.
// Consulted before an indexable is added; returning false keeps the
// indexable out of the index.  By default everything is accepted.
protected virtual bool PreAddIndexableHook (Indexable indexable)
{
	return true;
}

// If we are remapping Uris, indexables should be added to the index
// with the internal Uri attached.  Thus the receipt will come back
// with an internal Uri.  For change notification to work correctly
// it has to be mapped to an external Uri, which is what an override
// of this hook is for.  The default does nothing.
protected virtual void PostAddHook (Indexable indexable, IndexerAddedReceipt receipt)
{
}

// Called for every removed-receipt that a flush produces.  The
// default does nothing.
protected virtual void PostRemoveHook (Indexable indexable, IndexerRemovedReceipt receipt)
{
}
393 //////////////////////////////////////////////////////////////////////////////////
395 // Adding a single indexable
// Scheduler task that adds (or, for a Remove-type indexable, removes)
// a single indexable through its queryable.
private class AddTask : Scheduler.Task {
	LuceneQueryable queryable;
	Indexable indexable;

	public AddTask (LuceneQueryable queryable,
			Indexable indexable)
	{
		this.queryable = queryable;
		this.indexable = indexable;
		this.Tag = indexable.DisplayUri.ToString ();
		this.Weight = 1;
	}

	// Marks the queryable as indexing, queues the indexable if
	// PreAddIndexableHook accepts it, and flushes: always for an
	// Immediate-priority task, otherwise only when ConditionalFlush
	// decides to.
	override protected void DoTaskReal ()
	{
		QueryableState old_state = queryable.State;
		queryable.State = QueryableState.Indexing;

		try {
			if (queryable.PreAddIndexableHook (indexable)) {
				queryable.AddIndexable (indexable);

				if (Priority == Scheduler.Priority.Immediate)
					queryable.Flush ();
				else
					queryable.ConditionalFlush ();
			}
		} finally {
			// Restore the previous state even when the hook,
			// the add or the flush throws, as Flush and
			// ConditionalFlush do; otherwise the backend
			// would stay in the Indexing state.
			queryable.State = old_state;
		}
	}

	override protected void DoCleanup ()
	{
		indexable.Cleanup ();
	}
}
// Creates (but does not schedule) a task that adds one indexable.
// The task's Source is this queryable.
public Scheduler.Task NewAddTask (Indexable indexable)
{
	AddTask add_task = new AddTask (this, indexable);
	add_task.Source = this;

	return add_task;
}
441 //////////////////////////////////////////////////////////////////////////////////
443 // Adding an indexable generator
// Scheduler task that pulls indexables out of a generator and adds
// them through its queryable until a flush goes through.
private class AddGeneratorTask : Scheduler.Task {
	LuceneQueryable queryable;
	IIndexableGenerator generator;

	public AddGeneratorTask (LuceneQueryable queryable,
				 IIndexableGenerator generator)
	{
		this.queryable = queryable;
		this.generator = generator;
		this.Tag = generator.StatusName;
	}

	// Runs one batch.  The task asks to be rescheduled unless the
	// generator reports that it has no more indexables.
	override protected void DoTaskReal ()
	{
		// Since this is a generator, we want the task to
		// get re-scheduled after it is run.
		Reschedule = true;

		QueryableState old_state = queryable.State;
		queryable.State = QueryableState.Indexing;

		try {
			// Number of times a null indexable was returned.  We don't want
			// to spin tightly in a loop here if we're not actually indexing
			// things.
			int misfires = 0;

			do {
				if (! generator.HasNextIndexable ()) {
					// Of course, don't reschedule if there is no more work to do.
					Reschedule = false;
					break;
				}

				Indexable generated;
				generated = generator.GetNextIndexable ();

				// Note that the indexable generator can return null.
				// This means that the generator didn't have an indexable
				// to return this time through, but it does not mean that
				// its processing queue is empty.
				if (generated == null) {
					misfires++;

					if (misfires > 179) // Another totally arbitrary number
						break;
					else
						continue; // re-evaluates the loop condition, i.e. ConditionalFlush
				}

				if (queryable.PreAddIndexableHook (generated))
					queryable.AddIndexable (generated);
				else
					generated.Cleanup ();

				// We keep adding indexables until a flush goes through.
			} while (! queryable.ConditionalFlush ());

			generator.PostFlushHook ();
		} finally {
			// Restore the previous state even when the
			// generator, a hook or the flush throws;
			// otherwise the backend would stay in the
			// Indexing state.
			queryable.State = old_state;
		}
	}

	override protected void DoCleanup ()
	{
	}
}
// Creates (but does not schedule) a task that drains an indexable
// generator.  The task's Source is this queryable.
public Scheduler.Task NewAddTask (IIndexableGenerator generator)
{
	AddGeneratorTask generator_task = new AddGeneratorTask (this, generator);
	generator_task.Source = this;

	return generator_task;
}
520 //////////////////////////////////////////////////////////////////////////////////
522 // There used to be a separate type of task for doing removes.
523 // This is all that remains of that old code.
// Creates a task that removes uri from the index.  A removal is
// expressed as an ordinary add task carrying a Remove-type indexable.
public Scheduler.Task NewRemoveTask (Uri uri)
{
	Indexable removal = new Indexable (IndexableType.Remove, uri);

	return NewAddTask (removal);
}

//////////////////////////////////////////////////////////////////////////////////

// Creates a task that removes every item matching prop, by feeding a
// PropertyRemovalGenerator into a generator task.
public Scheduler.Task NewRemoveByPropertyTask (Property prop)
{
	PropertyRemovalGenerator removal_generator;
	removal_generator = new PropertyRemovalGenerator (driver, prop);

	return NewAddTask (removal_generator);
}
541 ///////////////////////////////////////////////////////////////////////////////////
544 // An IIndexableGenerator that returns remove Indexables for
545 // all items which match a certain property
// An IIndexableGenerator that returns Remove-type indexables for all
// items which match a certain property.  The matching Uris are
// queried from the driver once, on first use.
private class PropertyRemovalGenerator : IIndexableGenerator {

	private LuceneQueryingDriver driver;
	private Property prop_to_match;
	private Uri[] uris_to_remove;
	private int idx;

	public PropertyRemovalGenerator (LuceneQueryingDriver driver, Property prop)
	{
		this.driver = driver;
		this.prop_to_match = prop;
	}

	// Runs the property query the first time it is needed.  A
	// null query result is treated as "nothing to remove".
	private void EnsureUrisLoaded ()
	{
		if (uris_to_remove != null)
			return;

		uris_to_remove = this.driver.PropertyQuery (this.prop_to_match);
		if (uris_to_remove == null)
			uris_to_remove = new Uri [0];
	}

	// Returns a removal indexable for the next matching Uri, or
	// null when every Uri has already been handed out.
	public Indexable GetNextIndexable ()
	{
		// Do not rely on HasNextIndexable having been called first.
		EnsureUrisLoaded ();

		if (idx >= uris_to_remove.Length)
			return null;

		Indexable indexable;

		indexable = new Indexable (IndexableType.Remove, uris_to_remove [idx]);
		idx++;

		return indexable;
	}

	// True while there are matching Uris that have not been
	// handed out yet.
	public bool HasNextIndexable ()
	{
		EnsureUrisLoaded ();

		return idx < uris_to_remove.Length;
	}

	public string StatusName {
		get {
			return String.Format ("Removing {0}={1}", prop_to_match.Key, prop_to_match.Value);
		}
	}

	public void PostFlushHook () { }
}
592 //////////////////////////////////////////////////////////////////////////////////
594 // When all other tasks are complete, we need to do a final flush.
595 // We schedule that as a maintenance task.
// Task whose only job is to flush its queryable.
private class FinalFlushTask : Scheduler.Task {
	LuceneQueryable queryable;

	public FinalFlushTask (LuceneQueryable owner)
	{
		this.queryable = owner;
	}

	protected override void DoTaskReal ()
	{
		this.queryable.Flush ();
	}
}
// Adds the final-flush task to the scheduler, creating the task on
// first use.  It runs at Maintenance priority.
private void ScheduleFinalFlush ()
{
	if (our_final_flush_task == null) {
		Scheduler.Task flush_task = new FinalFlushTask (this);

		flush_task.Tag = "Final Flush for " + Name;
		flush_task.Priority = Scheduler.Priority.Maintenance;
		flush_task.SubPriority = 100; // do this first when starting maintenance
		flush_task.Source = this;

		our_final_flush_task = flush_task;
	}

	ThisScheduler.Add (our_final_flush_task);
}
627 //////////////////////////////////////////////////////////////////////////////////
629 // Optimize the index
// When the index was last optimized; DateTime.MinValue until set.
private DateTime last_optimize_time = DateTime.MinValue;

// Read by ScheduleOptimize and updated by OptimizeTask after each
// optimize.
public DateTime LastOptimizeTime {
	get {
		return last_optimize_time;
	}
	set {
		last_optimize_time = value;
	}
}
// Task that optimizes its queryable's index and then records the
// time at which that happened.
private class OptimizeTask : Scheduler.Task {
	LuceneQueryable queryable;

	public OptimizeTask (LuceneQueryable owner)
	{
		this.queryable = owner;
	}

	protected override void DoTaskReal ()
	{
		this.queryable.Optimize ();
		this.queryable.LastOptimizeTime = DateTime.Now;
	}
}
// Creates (but does not schedule) a Maintenance-priority task that
// optimizes this queryable's index.
public Scheduler.Task NewOptimizeTask ()
{
	Scheduler.Task optimize_task = new OptimizeTask (this);

	optimize_task.Tag = "Optimize " + Name;
	optimize_task.Priority = Scheduler.Priority.Maintenance;
	optimize_task.Source = this;

	return optimize_task;
}

// Handler for OptimizeAll: schedules a fresh optimize task, with its
// priority changed from Maintenance to Delayed.
private void OnOptimizeAllEvent ()
{
	Scheduler.Task optimize_task = NewOptimizeTask (); // construct an optimizer task
	optimize_task.Priority = Scheduler.Priority.Delayed; // but boost the priority

	ThisScheduler.Add (optimize_task);
}
// (Re)schedules this queryable's optimize task.
private void ScheduleOptimize ()
{
	if (our_optimize_task == null)
		our_optimize_task = NewOptimizeTask ();

	double delay_minutes;

	if (OptimizeRightAway || Environment.GetEnvironmentVariable ("BEAGLE_UNDER_BLUDGEON") != null) {
		delay_minutes = 1/120.0; // half a second
	} else {
		// Really we only want to optimize at most once a day,
		// even if we have indexed a ton of data: wait ten
		// minutes when the last optimize is more than a day
		// old, otherwise wait out the remainder of the day.
		TimeSpan since_last = DateTime.Now - last_optimize_time;

		delay_minutes = (since_last.TotalDays > 1.0)
			? 10.0
			: (new TimeSpan (TimeSpan.TicksPerDay) - since_last).TotalMinutes;
	}

	// Changing the trigger time of an already-scheduled process
	// does what you would expect.
	our_optimize_task.TriggerTime = DateTime.Now.AddMinutes (delay_minutes);

	// Adding the same task more than once is a harmless no-op.
	ThisScheduler.Add (our_optimize_task);
}
698 //////////////////////////////////////////////////////////////////////////////////
700 // Other hooks
702 // If this returns true, a task will automatically be created to
703 // add the child.
// Consulted for every child indexable reported by a flush.  When it
// returns true, an add task is created for the child automatically.
// The default accepts every child.
protected virtual bool PreChildAddHook (Indexable child)
{
	return true;
}

// Called with the request that is about to be handed to the indexer.
// The default does nothing.
protected virtual void PreFlushHook (IndexerRequest flushed_request)
{
}

// Called with the flushed request and the receipts the indexer
// returned for it.  The default does nothing.
protected virtual void PostFlushHook (IndexerRequest flushed_request,
				      IndexerReceipt [] receipts)
{
}
716 //////////////////////////////////////////////////////////////////////////////////
// Stamps the indexable with this backend's name and queues it in the
// pending request.
protected void AddIndexable (Indexable indexable)
{
	indexable.Source = this.Name;

	lock (request_lock) {
		pending_request.Add (indexable);
	}

	// Better safe than sorry: a final flush is scheduled every
	// time anything is added.
	ScheduleFinalFlush ();
}
// Flags the pending request for index optimization and flushes it
// right away.  The request lock is held across both steps.
protected void Optimize ()
{
	lock (this.request_lock) {
		this.pending_request.OptimizeIndex = true;
		this.Flush ();
	}
}
738 // Returns true if we actually did flush, false otherwise.
739 protected bool ConditionalFlush ()
741 QueryableState old_state = State;
742 State = QueryableState.Flushing;
744 try {
745 lock (request_lock) {
746 if (pending_request.Count > 37) { // a total arbitrary magic number
747 Flush ();
748 return true;
751 return false;
752 } finally {
753 State = old_state;
// Flushes the pending request.  The state is Flushing while DoFlush
// runs and is restored afterwards, even if DoFlush throws.
protected void Flush ()
{
	QueryableState saved_state = this.State;
	this.State = QueryableState.Flushing;

	try {
		this.DoFlush ();
	} finally {
		this.State = saved_state;
	}
}
// Hands the pending request to the indexer and processes the receipts
// that come back: added and removed Uris are collected and reported to
// the driver, the per-receipt hooks are called, and an add task is
// scheduled for each accepted child indexable.  Returns immediately
// when the pending request is empty.
// NOTE(review): indexer is null for a read-only queryable (the
// constructor returns before assigning it), so reaching the
// indexer.Flush call in that mode would throw -- confirm that nothing
// is ever queued on a read-only queryable.
769 private void DoFlush ()
771 IndexerRequest flushed_request;
// Detach the pending request under the lock and replace it with a
// fresh one, so that later adds go into the next batch.
773 lock (request_lock) {
774 if (pending_request.IsEmpty)
775 return;
777 flushed_request = pending_request;
778 pending_request = new IndexerRequest ();
780 // We hold the request_lock when calling PreFlushHook, so
781 // that no other requests can come in until it exits.
782 PreFlushHook (flushed_request);
785 IndexerReceipt [] receipts;
786 receipts = indexer.Flush (flushed_request);
// PostFlushHook runs before the null check below, so it can be
// called with null receipts.
788 PostFlushHook (flushed_request, receipts);
790 // Silently return if we get a null back. This is probably
791 // a bad thing to do.
792 if (receipts == null)
793 return;
795 // Nothing happened (except maybe an optimize, which does not
796 // generate a receipt). Also do nothing.
797 if (receipts.Length == 0)
798 return;
800 // Update the cached count of items in the driver
801 driver.SetItemCount (indexer.GetItemCount ());
803 // Something happened, so schedule an optimize just in case.
804 ScheduleOptimize ();
// The attribute-store transaction is only opened when the store
// has already been created (fa_store is still null otherwise).
// NOTE(review): the matching CommitTransaction below is not in a
// finally block, so an exception escaping the receipt loop would
// skip it.
806 if (fa_store != null)
807 fa_store.BeginTransaction ();
809 ArrayList added_uris = new ArrayList ();
810 ArrayList removed_uris = new ArrayList ();
812 for (int i = 0; i < receipts.Length; ++i) {
814 if (receipts [i] is IndexerAddedReceipt) {
816 IndexerAddedReceipt r;
817 r = (IndexerAddedReceipt) receipts [i];
819 // Add the Uri to the list for our change data
820 // before doing any post-processing.
821 // This ensures that we have internal uris when
822 // we are remapping.
823 added_uris.Add (r.Uri);
825 // Call the appropriate hook
826 try {
827 // Map from internal->external Uris in the PostAddHook
828 PostAddHook (flushed_request.GetByUri (r.Uri), r);
829 } catch (Exception ex) {
830 Logger.Log.Warn (ex, "Caught exception in PostAddHook '{0}' '{1}' '{2}'",
831 r.Uri, r.FilterName, r.FilterVersion);
834 // Every added Uri also needs to be listed as removed,
835 // to avoid duplicate hits in the query. Since the
836 // removed Uris need to be external Uris, we add them
837 // to the list *after* post-processing.
838 removed_uris.Add (r.Uri);
841 } else if (receipts [i] is IndexerRemovedReceipt) {
843 IndexerRemovedReceipt r;
844 r = (IndexerRemovedReceipt) receipts [i];
846 // Drop the removed item from the text cache
847 TextCache.UserCache.Delete (r.Uri);
850 // Call the appropriate hook
851 try {
852 PostRemoveHook (flushed_request.GetByUri (r.Uri), r);
853 } catch (Exception ex) {
854 Logger.Log.Warn (ex, "Caught exception in PostRemoveHook '{0}'",
855 r.Uri);
858 // Add the removed Uri to the list for our
859 // change data. This will be an external Uri
860 // when we are remapping.
861 removed_uris.Add (r.Uri);
863 } else if (receipts [i] is IndexerChildIndexablesReceipt) {
865 IndexerChildIndexablesReceipt r;
866 r = (IndexerChildIndexablesReceipt) receipts [i];
868 foreach (Indexable child in r.Children) {
869 bool please_add_a_new_task = false;
// A hook that throws leaves please_add_a_new_task false, so the
// child is cleaned up rather than scheduled.
// NOTE(review): the InvalidOperationException variable `ex` below
// is never used.
871 try {
872 please_add_a_new_task = PreChildAddHook (child);
873 } catch (InvalidOperationException ex) {
874 // Queryable does not support adding children
875 } catch (Exception ex) {
876 Logger.Log.Warn (ex, "Caught exception in PreChildAddHook '{0}'", child.DisplayUri);
879 if (please_add_a_new_task) {
880 //Logger.Log.Debug ("Adding child {0}", child.Uri);
881 Scheduler.Task task = NewAddTask (child);
882 task.SubPriority = 1;
883 ThisScheduler.Add (task);
884 } else
885 child.Cleanup ();
890 if (fa_store != null)
891 fa_store.CommitTransaction ();
893 // Propagate the change notification to any open queries.
894 if (added_uris.Count > 0 || removed_uris.Count > 0)
895 driver.QueryableChanged (added_uris, removed_uris);
898 //////////////////////////////////////////////////////////////////////////////////
901 // It is often convenient to have easy access to a FileAttributeStore
// Chooses the backing store for file attributes: the
// extended-attribute implementation when ExtendedAttribute.Supported
// is true, the Sqlite implementation otherwise.
protected virtual IFileAttributesStore BuildFileAttributesStore ()
{
	if (! ExtendedAttribute.Supported)
		return new FileAttributesStore_Sqlite (IndexDirectory, IndexFingerprint);

	return new FileAttributesStore_ExtendedAttribute (IndexFingerprint);
}

// The file attributes store of this queryable, created on first
// access.
public FileAttributesStore FileAttributesStore {
	get {
		if (fa_store != null)
			return fa_store;

		fa_store = new FileAttributesStore (BuildFileAttributesStore ());
		return fa_store;
	}
}
921 //////////////////////////////////////////////////////////////////////////////////
// Supplies the querying driver for this queryable.  The arguments
// are currently unused: the shared LuceneQueryingDriver.Singleton is
// returned in place of the per-source driver that used to be built:
//   new LuceneQueryingDriver (source_name, source_version, read_only_mode)
protected virtual LuceneQueryingDriver BuildLuceneQueryingDriver (string source_name,
								  int source_version,
								  bool read_only_mode)
{
	LuceneQueryingDriver shared_driver = LuceneQueryingDriver.Singleton;

	return shared_driver;
}