Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / LuceneDriver.cs
blob4f2d606d700aeda9fb9e652f0fe50a0317c882b6
1 //
2 // LuceneDriver.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This should be the only piece of source code that knows anything
29 // about Lucene's internals.
32 using System;
33 using System.Collections;
34 using System.Diagnostics;
35 using System.Globalization;
36 using System.IO;
37 using System.Text;
38 using System.Threading;
39 using System.Xml;
40 using System.Xml.Serialization;
42 using Lucene.Net.Analysis;
43 using Lucene.Net.Analysis.Standard;
44 using Lucene.Net.Documents;
45 using Lucene.Net.Index;
46 using Lucene.Net.QueryParsers;
47 using LNS = Lucene.Net.Search;
49 using Beagle.Util;
51 namespace Beagle.Daemon {
53 public class LuceneDriver : IIndexer {
// Callback types used by DoQuery to filter, remap and post-process hits.
public delegate bool UriFilter (Uri uri);
public delegate Uri UriRemapper (Uri uri);
public delegate Hit HitProcessor (Hit hit);
public delegate double RelevancyMultiplier (Hit hit);

// Raised from Flush: index content changes, child indexables produced
// by filters, and per-indexable filtering status.
public event IIndexerChangedHandler ChangedEvent;
public event IIndexerChildIndexableHandler ChildIndexableEvent;
public event IIndexerUrisFilteredHandler UrisFilteredEvent;
64 /////////////////////////////////////////////////////
66 // 1: Original
67 // 2: Changed format of timestamp strings
68 // 3: Schema changed to be more Dashboard-Match-like
69 // 4: Schema changed for files to include _Directory property
70 // 5: Changed analyzer to support stemming. Bumped version # to
71 // force everyone to re-index.
72 // 6: lots of schema changes as part of the general refactoring
73 // 7: incremented to force a re-index after our upgrade to lucene 1.4
74 // (in theory the file formats are compatible, we are seeing 'term
75 // out of order' exceptions in some cases)
76 // 8: another forced re-index, this time because of massive changes
77 // in the file system backend (it would be nice to have per-backend
78 // versioning so that we didn't have to purge all indexes just
79 // because one changed)
80 // 9: changed the way properties are stored, changed in conjunction
81 // with sane handling of multiple properties on hits.
// On-disk index format version.  A mismatch at startup causes a full
// purge-and-reindex (see Setup).  The history of bumps is listed above.
private const int MAJOR_VERSION = 9;
private int minor_version = 0;

private string top_dir;                                    // root directory of this index
private Hashtable pending_by_uri = UriFu.NewHashtable ();  // uri -> Indexable (add) or null (removal)
private int pending_adds = 0;                              // count of queued adds
private int pending_removals = 0;                          // count of queued removals
private int cycles_since_last_optimization = 0;
private bool optimizing = false;
private int last_item_count = -1;                          // cached doc count; -1 means unknown

private TextCache text_cache = TextCache.UserCache;

// Text cache handed to the filters when indexables are flushed.
public TextCache TextCache {
	get { return text_cache; }
	set { text_cache = value; }
}
100 /////////////////////////////////////////////////////
// Convenience constructors; all of them funnel into Setup.
public LuceneDriver (string index_name) : this (index_name, -1, false) { }

// FIXME(review): index_dir is accepted but never used -- this overload
// behaves exactly like LuceneDriver(index_name).  Confirm against callers.
public LuceneDriver (string index_name, string index_dir) : this (index_name, -1, false) { }

public LuceneDriver (string index_name, bool disable_locking) : this (index_name, -1, disable_locking) { }

public LuceneDriver (string index_name, int index_version) : this (index_name, index_version, false) { }

public LuceneDriver (string index_name, int index_version, bool disable_locking)
{
	Setup (index_name, index_version, disable_locking);
}
// Root directory holding this index's files (version, fingerprint, Locks/, Index/).
public string IndexDirectory {
	get { return top_dir; }
}
119 /////////////////////////////////////////////////////
122 // The Lucene Store
// The underlying Lucene store, and the filesystem path it lives at.
// Both are assigned once, in Setup.
private Lucene.Net.Store.Directory ourStore = null;
private string ourStorePath = null;

public Lucene.Net.Store.Directory Store {
	get { return ourStore; }
}

public string StorePath {
	get { return ourStorePath; }
}
137 /////////////////////////////////////////////////////
// Validate (or create from scratch) the on-disk index layout:
//   top_dir/version       "MAJOR.minor" format stamp
//   top_dir/fingerprint   GUID identifying this particular index
//   top_dir/Locks/        Lucene lock files
//   top_dir/Index/        the Lucene index proper
// A version mismatch, missing fingerprint, or dangling write lock all
// cause the whole tree to be purged and rebuilt.
private void Setup (string index_name, int minor_version, bool disable_locking)
{
	top_dir = Path.Combine (PathFinder.IndexDir, index_name);

	string versionFile = Path.Combine (top_dir, "version");
	string fingerprintFile = Path.Combine (top_dir, "fingerprint");
	string lockDir = Path.Combine (top_dir, "Locks");
	string indexDir = Path.Combine (top_dir, "Index");
	string indexTestFile = Path.Combine (indexDir, "segments");

	bool versionExists = File.Exists (versionFile);
	bool fingerprintExists = File.Exists (fingerprintFile);
	bool indexExists = File.Exists (indexTestFile);

	// Check the index's version number.  If it is wrong,
	// declare the index non-existent.
	if (versionExists && indexExists) {
		StreamReader sr = new StreamReader (versionFile);
		string versionStr = sr.ReadLine ();
		sr.Close ();

		int old_major_version, old_minor_version;
		int i = versionStr.IndexOf (".");

		// Old-style stamps had no minor component.
		if (i != -1) {
			old_major_version = Convert.ToInt32 (versionStr.Substring (0,i));
			old_minor_version = Convert.ToInt32 (versionStr.Substring (i+1));
		} else {
			old_major_version = Convert.ToInt32 (versionStr);
			old_minor_version = 0;
		}

		// A caller-supplied minor_version < 0 means "don't check".
		if (minor_version >= 0 && (old_major_version != MAJOR_VERSION || old_minor_version != minor_version)) {
			log.Debug ("Version mismatch in {0}", index_name);
			log.Debug ("Index has version {0}.{1}, expected {2}.{3}",
				   old_major_version, old_minor_version,
				   MAJOR_VERSION, minor_version);
			indexExists = false;
		}
	}

	// If there is no fingerprint file, declare the index
	// non-existent.
	if (indexExists && ! fingerprintExists)
		indexExists = false;

	// If the index seems to exist but contains dangling locks,
	// declare the index non-existent.
	if (indexExists) {
		DirectoryInfo lockDirInfo = new DirectoryInfo (lockDir);
		if (! lockDirInfo.Exists)
			indexExists = false;
		else {
			foreach (FileInfo info in lockDirInfo.GetFiles ()) {
				if (info.Name.IndexOf ("write.lock") != -1) {
					indexExists = false;
					break;
				}
			}
			if (! indexExists)
				log.Debug ("Found dangling locks in {0}", lockDir);
		}
	}

	if (indexExists) {
		// Read in the fingerprint
		StreamReader sr = new StreamReader (fingerprintFile);
		fingerprint = sr.ReadLine ();
		sr.Close ();
	} else {
		// We can't create the index if we are in
		// read-only mode obviously.
		if (disable_locking)
			throw new InvalidOperationException ("LuceneDriver is in read-only mode, but requires index-creation");

		//if (minor_version < 0)
		//	throw new InvalidOperationException ("LuceneDriver was passed an unspecified minor_version, and index-creation is required.");

		// FIXME: This isnt exactly the way to do it
		if (minor_version < 0)
			minor_version = 0;

		// Purge and rebuild the index's directory
		// structure.
		if (Directory.Exists (top_dir)) {
			log.Debug ("Purging {0}", top_dir);
			Directory.Delete (top_dir, true);
		}

		// Create all directories.
		Directory.CreateDirectory (top_dir);
		Directory.CreateDirectory (lockDir);
		Directory.CreateDirectory (indexDir);

		StreamWriter sw;

		// Generate a fingerprint and write it out
		fingerprint = Guid.NewGuid ().ToString ();
		sw = new StreamWriter (fingerprintFile, false);
		sw.WriteLine (fingerprint);
		sw.Close ();

		// Write out our version information
		sw = new StreamWriter (versionFile, false);
		sw.WriteLine ("{0}.{1}", MAJOR_VERSION, minor_version);
		sw.Close ();
	}

	Lucene.Net.Store.FSDirectory store;
	store = Lucene.Net.Store.FSDirectory.GetDirectory (indexDir, lockDir, false, disable_locking);

	ourStore = store;
	ourStorePath = indexDir;

	//Store = store;

	if (!indexExists) {
		// This creates the index if it doesn't exist
		IndexWriter writer = new IndexWriter (Store, null, true);
		writer.Close ();
	}

	if (Environment.GetEnvironmentVariable ("BEAGLE_OPTIMIZE_ON_STARTUP") != null) {
		cycles_since_last_optimization = 1000; // this can't be zero, or nothing will happen
		Optimize ();
	}
}
270 /////////////////////////////////////////////////////
273 // The log
// Shared logger for all lucene-related messages.
private static Logger log = Logger.Get ("lucene");

private static Logger Log {
	get { return log; }
}
282 /////////////////////////////////////////////////////
285 // The Index's Fingerprint and up-to-date checking
// GUID generated when the index is created (see Setup); lets
// consumers detect that an index was rebuilt behind their back.
private string fingerprint = null;

public string Fingerprint {
	get { return fingerprint; }
}
294 /////////////////////////////////////////////////////
297 // Public Indexing API
// Shared empty array passed to ChangedEvent when there is nothing to report.
static object [] empty_collection = new object [0];

// Queue an indexable to be written out on the next Flush.  A later
// Add or Remove for the same Uri supersedes an earlier one.
public void Add (Indexable indexable)
{
	Uri uri = indexable.Uri;

	lock (pending_by_uri) {
		if (pending_by_uri.Contains (uri)) {
			// Keep the counters in sync with the table: this add
			// supersedes whatever was already queued for the uri.
			if (pending_by_uri [uri] == null)
				--pending_removals;
			else
				--pending_adds; // was: re-adding a pending uri inflated pending_adds
		}
		pending_by_uri [uri] = indexable;
		++pending_adds;
	}
}
// Queue a removal, to be applied on the next Flush.  A null value in
// pending_by_uri marks a removal.
public void Remove (Uri uri)
{
	lock (pending_by_uri) {
		if (pending_by_uri.Contains (uri)) {
			// This removal supersedes whatever was queued before;
			// adjust the counters so they track the table.
			if (pending_by_uri [uri] != null)
				--pending_adds;
			else
				--pending_removals; // was: repeated Remove inflated pending_removals
		}
		pending_by_uri [uri] = null;
		++pending_removals;
	}
}
// Part of IIndexer, but not supported by this driver: renames must be
// handled at a higher level (remove + re-add).  Calls are logged and dropped.
public void Rename (Uri old_uri, Uri new_uri)
{
	Logger.Log.Error ("**** LuceneDriver does not support Rename!");
	Logger.Log.Error ("**** old_uri={0}", old_uri);
	Logger.Log.Error ("**** new_uri={0}", new_uri);
}
// Counts of queued work.  NOTE(review): read without taking the
// pending_by_uri lock, so the values are advisory only.
public int PendingAdds {
	get { return pending_adds; }
}

public int PendingRemovals {
	get { return pending_removals; }
}
// Commit all queued adds and removals to the Lucene index, firing
// ChangedEvent / UrisFilteredEvent afterwards.  May trigger Optimize.
public void Flush ()
{
	ArrayList pending_uris;
	ArrayList pending_indexables;

	ArrayList added_uris;
	ArrayList removed_uris;
	ArrayList filtered_uris;

	lock (pending_by_uri) {

		if (pending_by_uri.Count == 0)
			return;

		pending_uris = new ArrayList ();
		pending_indexables = new ArrayList ();
		added_uris = new ArrayList ();
		removed_uris = new ArrayList ();
		filtered_uris = new ArrayList ();

		// Move our indexables and remove requests out of the
		// hash and into local data structures.
		foreach (DictionaryEntry entry in pending_by_uri) {
			Uri uri = (Uri) entry.Key;
			Indexable indexable = (Indexable) entry.Value;

			pending_uris.Add (uri);
			if (indexable != null)
				pending_indexables.Add (indexable);

			// A null value marks a queued removal.
			if (indexable != null)
				added_uris.Add (uri);
			else
				removed_uris.Add (uri);
		}

		pending_adds = 0;
		pending_removals = 0;
		pending_by_uri.Clear ();
	}

	int add_count = 0; // NOTE(review): written below but never read
	int removal_count = 0;

	Log.Debug ("Flushing...");

	Stopwatch watch = new Stopwatch ();

	// Step #1: Delete all items with the same URIs
	// as our pending items from the index.  Any children
	// (matched by ParentUri) are deleted too.
	watch.Restart ();
	IndexReader reader = IndexReader.Open (Store);
	foreach (Uri uri in pending_uris) {
		log.Debug ("- {0}", uri);

		Term term = new Term ("Uri", uri.ToString ());
		reader.Delete (term);
		++removal_count;

		term = new Term ("ParentUri", uri.ToString ());
		removal_count += reader.Delete (term);
	}
	last_item_count = reader.NumDocs ();
	reader.Close ();
	watch.Stop ();
	Log.Debug ("Lucene Delete: {0} {1} {2}", watch, pending_uris.Count,
		   pending_uris.Count / watch.ElapsedTime);

	// Step #2: Write out the pending adds
	watch.Restart ();
	IndexWriter writer = null;

	// FIXME: Change this to a queue so that items
	// that are added while we flush (like children) also
	// gets committed.
	foreach (Indexable indexable in pending_indexables) {

		Log.Debug ("+ {0}", indexable.DisplayUri);

		Filter filter = null;

		// A filtering failure is logged, and the indexable is
		// still indexed with whatever metadata it carries.
		try {
			FilterFactory.FilterIndexable (indexable, this.text_cache, out filter);
		} catch (Exception e) {
			Log.Error ("Unable to filter {0} (mimetype={1})", indexable.DisplayUri, indexable.MimeType);
			Log.Error (e);
		}

		Document doc = null;
		try {
			doc = ToLuceneDocument (indexable);
		} catch (Exception e) {
			Log.Error ("Unable to convert {0} (type={1}) to a lucene document",
				   indexable.Uri, indexable.Type);
			Log.Error (e);
		}

		if (doc != null) {
			// The writer is created lazily, so a flush of pure
			// removals never opens one.
			if (writer == null)
				writer = new IndexWriter (Store, Analyzer, false);
			writer.AddDocument (doc);
			++last_item_count;
			++add_count;
		}

		if (filter != null) {
			filtered_uris.Add (FilteredStatus.New (indexable, filter));
		}

		if (filter != null && filter.ChildIndexables.Count > 0) {
			// Iterate across any indexables created by the
			// filter and set up the parent-child relationship.
			foreach (Indexable child in filter.ChildIndexables)
				child.SetChildOf (indexable);

			// If nobody is listening for ChildIndexableEvent,
			// just add them ourselves so we wont loose them
			if (ChildIndexableEvent != null)
				ChildIndexableEvent ((Indexable[]) filter.ChildIndexables.ToArray (typeof (Indexable)));
			else
				foreach (Indexable child in filter.ChildIndexables)
					Add (child);
		}
	}

	if (writer != null)
		writer.Close ();
	watch.Stop ();
	Log.Debug ("Lucene Add: {0} {1} {2}", watch, pending_indexables.Count,
		   pending_indexables.Count / watch.ElapsedTime);

	// Step #3: Fire off an event telling what we just did.
	if (ChangedEvent != null) {
		ChangedEvent (this, added_uris, removed_uris, empty_collection);
	}

	if (filtered_uris.Count > 0 && UrisFilteredEvent != null) {
		UrisFilteredEvent ((FilteredStatus[]) filtered_uris.ToArray (typeof (FilteredStatus)));
	}

	lock (pending_by_uri)
		cycles_since_last_optimization++;

	if (NeedsOptimize)
		Optimize ();
}
// True once enough Flush cycles have accumulated to justify an Optimize.
private bool NeedsOptimize {
	get {
		// FIXME: 19 is a totally arbitrary number.
		return cycles_since_last_optimization > 19;
	}
}
// Merge the index down to its optimized form.  Guarded by the
// 'optimizing' flag so concurrent and redundant calls return early.
private void Optimize ()
{
	// If nothing has happened since our last optimization,
	// do nothing.
	// If this index is already being optimized, don't
	// optimize it again.
	lock (pending_by_uri) {
		if (optimizing || cycles_since_last_optimization == 0)
			return;
		optimizing = true;
	}

	Log.Debug ("Optimizing {0}...", StorePath);

	Stopwatch watch = new Stopwatch ();
	watch.Start ();

	IndexWriter writer = new IndexWriter (Store, null, false);
	writer.Optimize ();
	writer.Close ();

	watch.Stop ();

	Log.Debug ("Optimization time for {0}: {1}", StorePath, watch);

	lock (pending_by_uri) {
		optimizing = false;
		cycles_since_last_optimization = 0;
	}
}
526 /////////////////////////////////////////////////////
528 // Returns the lowest matching score before the results are
529 // truncated.
// Run a query against the index, streaming matches into 'result'.
// search_subset restricts matches to a set of internal uris and
// bonus_uris always match; the remaining callbacks filter, remap and
// post-process each hit.  Stops early once 'result' starts rejecting
// scores (hits arrive in descending score order).
public void DoQuery (Query query,
		     IQueryResult result,
		     ICollection search_subset, // should be internal uris
		     ICollection bonus_uris,    // should be internal uris
		     UriFilter uri_filter,
		     UriRemapper uri_remapper,  // map to external uris
		     HitProcessor hit_processor, // post-process hits
		     RelevancyMultiplier relevancy_multiplier)
{
	double t_lucene;
	double t_assembly;

	LNS.Query lucene_query = ToLuceneQuery (query, search_subset, bonus_uris);
	if (lucene_query == null)
		return;

	Stopwatch sw = new Stopwatch ();
	sw.Start ();

	LNS.Searcher searcher = new LNS.IndexSearcher (Store);

	try {
		LNS.Hits hits = searcher.Search (lucene_query);
		sw.Stop ();

		t_lucene = sw.ElapsedTime;

		//////////////////////////////////////

		sw.Reset ();
		sw.Start ();

		int n_hits = hits.Length ();
		if (n_hits == 0)
			return;

		for (int i = 0; i < n_hits; ++i) {
			Document doc = hits.Doc (i);

			if (uri_filter != null) {
				Uri uri = UriFromLuceneDoc (doc);
				if (! uri_filter (uri))
					continue;
			}

			double score = (double) hits.Score (i);

			// Hits are ordered by score, so nothing after a
			// rejected score can be accepted either.
			if (result.WillReject (score)) {
				log.Debug ("Terminating DoQuery at {0} of {1} (score={2})", i, n_hits, score);
				break;
			}

			Hit hit = FromLuceneDocToHit (doc, hits.Id (i), score);
			if (uri_remapper != null)
				hit.Uri = uri_remapper (hit.Uri);

			if (hit_processor != null)
				hit = hit_processor (hit);

			if (relevancy_multiplier != null) {
				double m = relevancy_multiplier (hit);
				hit.ScoreMultiplier = (float) m;
			}

			result.Add (hit);
		}

		sw.Stop ();

		t_assembly = sw.ElapsedTime;

		log.Debug ("{0}: n_hits={1} lucene={2:0.00}s assembly={3:0.00}s",
			   StorePath, n_hits, t_lucene, t_assembly);
	} finally {
		// was: the n_hits == 0 early return (and any exception)
		// leaked the searcher.
		searcher.Close ();
	}
}
606 // FIXME: This should support Uri filtering, Uri remapping, etc.
// Look up hits for an explicit set of uris.  Queries are batched so a
// single boolean query never exceeds Lucene's max clause count.
// FIXME: This should support Uri filtering, Uri remapping, etc.
public ICollection DoQueryByUri (ICollection list_of_uris)
{
	LNS.BooleanQuery uri_query = new LNS.BooleanQuery ();
	ArrayList all_hits = new ArrayList ();

	int max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
	int clause_count = 0;

	foreach (Uri uri in list_of_uris) {
		Term term = new Term ("Uri", uri.ToString ());
		LNS.Query term_query = new LNS.TermQuery (term);
		uri_query.Add (term_query, false, false);
		++clause_count;

		// Batch is full: run it and start a fresh query.
		if (clause_count == max_clauses) {
			SearchIntoList (uri_query, all_hits);
			uri_query = new LNS.BooleanQuery ();
			clause_count = 0;
		}
	}

	if (clause_count > 0)
		SearchIntoList (uri_query, all_hits);

	return all_hits;
}

// Helper: run one query against the store and append every match,
// converted to a Hit, onto 'sink'.  (Extracted from the duplicated
// batch/tail code; also closes the searcher even on exceptions.)
private void SearchIntoList (LNS.Query q, ArrayList sink)
{
	LNS.Searcher searcher = new LNS.IndexSearcher (Store);
	try {
		LNS.Hits lucene_hits = searcher.Search (q);
		int n_hits = lucene_hits.Length ();

		for (int i = 0; i < n_hits; ++i) {
			Hit hit = FromLuceneDocToHit (lucene_hits.Doc (i),
						      lucene_hits.Id (i),
						      lucene_hits.Score (i));
			sink.Add (hit);
		}
	} finally {
		searcher.Close ();
	}
}
// Convenience overload: look up a single uri.
public ICollection DoQueryByUri (Uri uri)
{
	Uri[] single_uri_list = new Uri[] { uri };
	return DoQueryByUri (single_uri_list);
}
664 // We cache the number of documents in the index when readers are
665 // available, so calls to GetItemCount will return immediately
666 // if the driver has been flushed or queried.
public int GetItemCount ()
{
	// last_item_count < 0 means the cache is stale; refresh it.
	if (last_item_count < 0) {
		IndexReader reader = IndexReader.Open (Store);
		try {
			last_item_count = reader.NumDocs ();
		} finally {
			// was: a throwing NumDocs leaked the reader
			reader.Close ();
		}
	}
	return last_item_count;
}
678 ///////////////////////////////////////////////////////////////////////////////////////
681 // Code to map to/from Lucene data types
// Single-character code embedded in "prop:<code>:<key>" field names.
static char PropertyTypeToCode (PropertyType type)
{
	if (type == PropertyType.Text)
		return '_';
	if (type == PropertyType.Keyword)
		return 'k';
	if (type == PropertyType.Date)
		return 'd';

	throw new Exception ("Unknown property type: " + type);
}
// Inverse of PropertyTypeToCode: decode a field-name type code.
static PropertyType CodeToPropertyType (char c)
{
	if (c == '_')
		return PropertyType.Text;
	if (c == 'k')
		return PropertyType.Keyword;
	if (c == 'd')
		return PropertyType.Date;

	throw new Exception ("Unknown property code: " + c);
}
// Name of the aggregate field that collects all searched properties
// of a given type; null when the type has no wildcard field.
static string PropertyTypeToWildcardField (PropertyType type)
{
	if (type == PropertyType.Text)
		return "PropertyText";
	if (type == PropertyType.Keyword)
		return "PropertyKeyword";
	if (type == PropertyType.Date)
		return "PropertyDate";

	return null;
}
// Store one property on a document.  Searched properties are also
// duplicated into a per-type wildcard field ("PropertyText" etc.)
// so untargeted queries can match them.
private void AddPropertyToLuceneDocument (Document doc, Property prop)
{
	if (prop.Value == null)
		return;

	Field f;

	bool is_text;
	is_text = prop.Type == PropertyType.Text;

	if (prop.IsSearched) {
		string wildcard_field;
		wildcard_field = PropertyTypeToWildcardField (prop.Type);
		if (wildcard_field != null) {
			f = new Field (wildcard_field,
				       prop.Value,
				       false, // never stored
				       true, // always index
				       is_text); // only tokenize text
			doc.Add (f);
		}
	}

	// The key and type are encoded in the field name itself:
	// "prop:<type-code>:<key>".  See FieldToProperty for decoding.
	f = new Field (String.Format ("prop:{0}:{1}",
				      PropertyTypeToCode (prop.Type),
				      prop.Key),
		       prop.Value,
		       true, // always store
		       true, // always index
		       is_text); // only tokenize text
	doc.Add (f);
}
// Decode a stored "prop:<code>:<key>" field back into a Property;
// returns null for fields that are not properties.
private Property FieldToProperty (Field field)
{
	string name = field.Name ();
	if (name.Length < 7 || ! name.StartsWith ("prop:"))
		return null;

	// Names written by AddPropertyToLuceneDocument look like
	// "prop:k:foo": the type code sits at index 5, the key
	// starts at index 7.
	string key = name.Substring (7);
	string value = field.StringValue ();

	if (name [5] == 'k')
		return Property.NewKeyword (key, value);
	else if (name [5] == 'd')
		return Property.NewDateFromString (key, value);
	else if (name [6] == 's')
		// NOTE(review): for fields written by this class,
		// name[6] is always ':' -- this branch looks dead.
		// Confirm whether an 's' code was ever written by an
		// older index format before removing it.
		return Property.NewUnsearched (key, value);
	else
		return Property.New (key, value);
}
// Build the Lucene document for an indexable: the canonical fields
// (Uri, Type, ParentUri, MimeType, Timestamp), the text bodies, and
// then every attached property.
private Document ToLuceneDocument (Indexable indexable)
{
	Document doc = new Document ();
	Field f;
	String str;
	TextReader reader;

	// First we add the Indexable's 'canonical' properties
	// to the Document.

	f = Field.Keyword ("Uri", UriFu.UriToSerializableString (indexable.Uri));
	doc.Add (f);

	f = Field.Keyword ("Type", indexable.Type);
	doc.Add (f);

	if (indexable.ParentUri != null) {
		f = Field.Keyword ("ParentUri", UriFu.UriToSerializableString (indexable.ParentUri));
		doc.Add (f);
	}

	if (indexable.MimeType != null) {
		f = Field.Keyword ("MimeType", indexable.MimeType);
		doc.Add (f);
	}

	if (indexable.ValidTimestamp) {
		str = StringFu.DateTimeToString (indexable.Timestamp);
		f = Field.Keyword ("Timestamp", str);
		doc.Add (f);
	}

	if (! indexable.NoContent) {
		// Body and "hot" text are indexed via Field.Text with a
		// reader: tokenized but not stored in the index.
		reader = indexable.GetTextReader ();
		if (reader != null) {
			f = Field.Text ("Text", reader);
			doc.Add (f);
		}

		reader = indexable.GetHotTextReader ();
		if (reader != null) {
			f = Field.Text ("HotText", reader);
			doc.Add (f);
		}
	}

	foreach (Property prop in indexable.Properties)
		AddPropertyToLuceneDocument (doc, prop);

	return doc;
}
// Build a boolean OR-query matching any of the given uris, remapped
// first if a remapper is supplied.  Returns null for a null/empty
// collection.
static public LNS.Query ToUriQuery (ICollection list_of_uris, UriRemapper remapper)
{
	if (list_of_uris == null || list_of_uris.Count == 0)
		return null;

	LNS.BooleanQuery query = new LNS.BooleanQuery ();
	int max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
	int clause_count = 0;

	foreach (Uri original_uri in list_of_uris) {
		Uri uri = original_uri;
		if (remapper != null)
			uri = remapper (uri);
		//Logger.Log.Debug ("ToUriQuery: {0} => {1}", original_uri, uri);
		Term term = new Term ("Uri", uri.ToString ()); // FIXME: Do we need some UriFu here?
		LNS.Query term_query = new LNS.TermQuery (term);
		query.Add (term_query, false, false);
		++clause_count;
		// If we have too many clauses, nest the queries: the
		// query built so far becomes a single clause of a new one.
		if (clause_count == max_clauses) {
			LNS.BooleanQuery new_query = new LNS.BooleanQuery ();
			new_query.Add (query, false, false);
			query = new_query;
			clause_count = 1;
		}
	}

	return query;
}
// Convenience overload: no uri remapping.
static public LNS.Query ToUriQuery (ICollection list_of_uris)
{
	return ToUriQuery (list_of_uris, null);
}
// Run 'text' through our analyzer and build a query against 'field':
// a TermQuery for a single token, a PhraseQuery for several, or null
// if the text reduces to no tokens at all.
static private LNS.Query NewTokenizedQuery (string field, string text)
{
	ArrayList tokens = new ArrayList ();

	// Use the analyzer to extract the query's tokens.
	// This code is taken from Lucene's query parser.
	// We use the standard Analyzer.
	TokenStream source = LuceneDriver.Analyzer.TokenStream (field, new StringReader (text));
	while (true) {
		Lucene.Net.Analysis.Token t;
		try {
			t = source.Next ();
		} catch (IOException) {
			t = null;
		}
		if (t == null)
			break;
		tokens.Add (t.TermText ());
	}
	try {
		source.Close ();
	} catch (IOException) {
		// ignore
	}

	LNS.Query q = null;
	if (tokens.Count == 1) {
		Term t = new Term (field, (string) tokens [0]);
		q = new LNS.TermQuery (t);
	} else if (tokens.Count > 1) {
		q = new LNS.PhraseQuery ();
		foreach (string tokenStr in tokens) {
			Term t = new Term (field, tokenStr);
			((LNS.PhraseQuery) q).Add (t);
		}
	}

	return q;
}
897 // search_subset limits the score of our search to that set of Uris
898 // bonus_uris are always matched by the query
// Translate a Beagle query into one big Lucene query.
// search_subset limits the scope of our search to that set of Uris;
// bonus_uris are always matched by the query.  Returns null when no
// query part contributed anything searchable.
private LNS.Query ToLuceneQuery (Query query,
				 ICollection search_subset,
				 ICollection bonus_uris)
{
	LNS.BooleanQuery body_query = null;
	LNS.Query search_subset_query = null;
	LNS.Query bonus_uris_query = null;
	LNS.BooleanQuery mime_type_query = null;
	LNS.BooleanQuery hit_type_query = null;

	body_query = new LNS.BooleanQuery ();

	bool used_any_part = false;

	foreach (QueryPart part in query.Parts) {

		LNS.BooleanQuery part_query = new LNS.BooleanQuery ();
		LNS.Query part_query_override = null;
		LNS.Query subquery = null;

		bool used_this_part = false;

		if (part.TargetIsAll || part.TargetIsText) {

			subquery = NewTokenizedQuery ("Text", part.Text);
			if (subquery != null) {
				part_query.Add (subquery, false, false);
				used_this_part = true;
			}

			// Matches in the "hot" text are boosted above
			// plain body matches.
			subquery = NewTokenizedQuery ("HotText", part.Text);
			if (subquery != null) {
				subquery.SetBoost (1.75f);
				part_query.Add (subquery, false, false);
				used_this_part = true;
			}
		}

		if (part.TargetIsAll || part.TargetIsProperties) {
			subquery = NewTokenizedQuery ("PropertyText", part.Text);
			if (subquery != null) {
				subquery.SetBoost (1.75f);
				part_query.Add (subquery, false, false);
				used_this_part = true;
			}
		}

		if (part.TargetIsSpecificProperty) {

			// The field-name layout here must stay in sync
			// with AddPropertyToLuceneDocument.
			string prop_name;
			prop_name = String.Format ("prop:{0}:{1}",
						   part.IsKeyword ? 'k' : '_',
						   part.Target);

			if (part.IsKeyword) {
				Term term = new Term (prop_name, part.Text);
				subquery = new LNS.TermQuery (term);
			} else {
				subquery = NewTokenizedQuery (prop_name, part.Text);
			}

			// Instead of the boolean query, just use the subquery.
			if (subquery != null) {
				part_query_override = subquery;
				used_this_part = true;
			}
		}

		if (used_this_part) {
			if (part_query_override == null)
				part_query_override = part_query;
			body_query.Add (part_query_override, part.IsRequired, part.IsProhibited);
			used_any_part = true;
		}
	}

	if (! used_any_part)
		return null;

	search_subset_query = ToUriQuery (search_subset, null);

	bonus_uris_query = ToUriQuery (bonus_uris, null);

	if (query.MimeTypes.Count > 0) {
		mime_type_query = new LNS.BooleanQuery ();
		foreach (string mime_type in query.MimeTypes) {
			Term t = new Term ("MimeType", mime_type);
			LNS.Query q = new LNS.TermQuery (t);
			mime_type_query.Add (q, false, false);
		}
	}

	if (query.HasHitTypes) {
		hit_type_query = new LNS.BooleanQuery ();
		foreach (string hit_type in query.HitTypes) {
			Term t = new Term ("Type", hit_type);
			LNS.Query q = new LNS.TermQuery (t);
			hit_type_query.Add (q, false, false);
		}
	}

	// Now we combine the various parts into one big query.

	LNS.BooleanQuery total_query = new LNS.BooleanQuery ();

	// If we have hit types or mime types, those must be matched
	if (mime_type_query != null)
		total_query.Add (mime_type_query, true, false);
	if (hit_type_query != null)
		total_query.Add (hit_type_query, true, false);

	// We also must match the "content query":
	// (body_query OR bonus_uris_query) AND search_subset_query

	LNS.Query content_query = null;

	if (body_query != null && bonus_uris_query != null) {
		LNS.BooleanQuery q = new LNS.BooleanQuery ();
		q.Add (body_query, false, false);
		q.Add (bonus_uris_query, false, false);
		content_query = q;
	} else if (body_query != null) {
		content_query = body_query;
	} else if (bonus_uris_query != null) {
		content_query = bonus_uris_query;
	}

	if (content_query != null && search_subset_query != null) {
		LNS.BooleanQuery q = new LNS.BooleanQuery ();
		q.Add (content_query, true, false);
		q.Add (search_subset_query, true, false);
		content_query = q;
	} else if (search_subset_query != null) {
		content_query = search_subset_query;
	}

	if (content_query != null)
		total_query.Add (content_query, true, false);

	return total_query;
}
// Extract the mandatory Uri field from a stored document.
static private Uri UriFromLuceneDoc (Document doc)
{
	string uri = doc.Get ("Uri");
	if (uri == null)
		throw new Exception ("Got document from Lucene w/o a URI!");
	return UriFu.UriStringToUri (uri);
}
// Copy the stored versioning fields (currently just Timestamp)
// from a document onto a Versioned object.
static private void FromLuceneDocToVersioned (Document doc, Versioned versioned)
{
	string str;

	str = doc.Get ("Timestamp");
	if (str != null)
		versioned.Timestamp = StringFu.StringToDateTime (str);
}
// Rehydrate a Hit from a stored Lucene document.  Throws if the
// document lacks the mandatory Uri or Type field.
private Hit FromLuceneDocToHit (Document doc, int id, double score)
{
	Hit hit = new Hit ();

	hit.Id = id;

	string str;

	FromLuceneDocToVersioned (doc, hit);

	hit.Uri = UriFromLuceneDoc (doc);

	str = doc.Get ("Type");
	if (str == null)
		throw new Exception ("Got hit from Lucene w/o a Type!");
	hit.Type = str;

	str = doc.Get ("ParentUri");
	if (str != null)
		hit.ParentUri = UriFu.UriStringToUri (str);

	hit.MimeType = doc.Get ("MimeType");

	hit.Source = "lucene";
	hit.ScoreRaw = score;

	// Any stored "prop:*" fields become properties on the hit.
	foreach (Field ff in doc.Fields ()) {
		Property prop = FieldToProperty (ff);
		if (prop != null)
			hit.AddProperty (prop);
	}

	return hit;
}
1096 /////////////////////////////////////////////////////
1099 // A common, shared analyzer
// Token filter that drops "noise" tokens -- long mixed alphanumeric
// junk such as hex blobs and cache file names -- from body text
// before it is indexed.
private class BeagleNoiseFilter : TokenFilter {

	// Running totals, used only by the (disabled) debug logging below.
	static int total_count = 0;
	static int noise_count = 0;

	TokenStream token_stream;

	public BeagleNoiseFilter (TokenStream input) : base (input)
	{
		token_stream = input;
	}

	// FIXME: we should add some heuristics that are stricter
	// but explicitly try to avoid filtering out dates,
	// phone numbers, etc.
	private static bool IsNoise (string text)
	{
		// Anything really long is almost certainly noise.
		if (text.Length > 30)
			return true;

		// Look at how often we switch between numbers and letters.
		// Scoring:
		// <letter> <digit> 1
		// <digit> <letter> 1
		// <x> <punct>+ <x> 1
		// <x> <punct>+ <y> 2
		const int transitions_cutoff = 4;
		int last_type = -1, last_non_punct_type = -1, first_type = -1;
		bool has_letter = false, has_digit = false, has_punctuation = false;
		int transitions = 0;
		for (int i = 0; i < text.Length && transitions < transitions_cutoff; ++i) {
			char c = text [i];
			int type = -1;
			if (Char.IsLetter (c)) {
				type = 1;
				has_letter = true;
			} else if (Char.IsDigit (c)) {
				type = 2;
				has_digit = true;
			} else if (Char.IsPunctuation (c)) {
				type = 3;
				has_punctuation = true;
			}

			if (type != -1) {

				if (type != last_type) {
					// A run of punctuation scores one
					// transition, or two if the classes
					// on either side of it differ.
					if (last_type == 3) {
						if (type != last_non_punct_type)
							++transitions;
					} else {
						++transitions;
					}
				}

				if (first_type == -1)
					first_type = type;

				last_type = type;
				if (type != 3)
					last_non_punct_type = type;
			}
		}

		// If we make too many transitions, it must be noise.
		if (transitions >= transitions_cutoff)
			return true;

		// If we consist of nothing but digits and punctuation, treat it
		// as noise if it is too long.
		if (transitions == 1 && first_type != 1 && text.Length > 10)
			return true;

		// We are very suspicious of long things that make lots of
		// transitions
		if (transitions > 3 && text.Length > 10)
			return true;

		// Beware of anything long that contains a little of everything.
		if (has_letter && has_digit && has_punctuation && text.Length > 10)
			return true;

		//Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
		return false;
	}

	// Pull tokens from the wrapped stream, skipping the noisy ones.
	public override Lucene.Net.Analysis.Token Next ()
	{
		Lucene.Net.Analysis.Token token;
		while ( (token = token_stream.Next ()) != null) {
#if false
			if (total_count > 0 && total_count % 5000 == 0)
				Logger.Log.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)",
						  noise_count, total_count, 100.0 * noise_count / total_count);
#endif
			++total_count;
			if (IsNoise (token.TermText ())) {
				++noise_count;
				continue;
			}
			return token;
		}
		return null;
	}
}
// This is just a standard analyzer combined with the Porter stemmer.
// FIXME: This assumes everything being indexed is in English!
private class BeagleAnalyzer : StandardAnalyzer {
	public override TokenStream TokenStream (String fieldName, TextReader reader)
	{
		TokenStream outstream = base.TokenStream (fieldName, reader);
		// Noise filtering applies only to the body-text fields;
		// everything gets stemmed.
		if (fieldName == "Text" || fieldName == "HotText")
			outstream = new BeagleNoiseFilter (outstream);
		outstream = new PorterStemFilter (outstream);
		return outstream;
	}
}
private static Analyzer theAnalyzer;

// Lazily-created shared analyzer.  NOTE(review): initialization is not
// locked, so concurrent first use could construct two instances --
// probably harmless, but worth confirming.
private static Analyzer Analyzer {
	get {
		if (theAnalyzer == null)
			theAnalyzer = new BeagleAnalyzer ();
		return theAnalyzer;
	}
}
1233 /////////////////////////////////////////////////////
1236 // Access to the Stemmer
// Shared Porter stemmer instance.  NOTE(review): PorterStemmer may keep
// per-call state; confirm Stem is safe under concurrent use.
static PorterStemmer stemmer = new PorterStemmer ();

static public string Stem (string str)
{
	return stemmer.Stem (str);
}
1246 /////////////////////////////////////////////////////
// True if the (already stemmed) word is one of Lucene's English stop words.
public static bool IsStopWord (string stemmed_word)
{
	return ArrayFu.IndexOfString (StopAnalyzer.ENGLISH_STOP_WORDS, stemmed_word) != -1;
}
1253 /////////////////////////////////////////////////////
1256 // Helpful little utility functions
// Format a revision number for storage in the index.
static private String RevisionToString (long rev)
{
	return rev.ToString ();
}
// Parse a revision number previously written by RevisionToString.
// Convert.ToInt64 (unlike Int64.Parse) maps a null string to zero,
// so that behavior is preserved here.
static private long StringToRevision (String str)
{
	long revision = Convert.ToInt64 (str);
	return revision;
}
1269 /////////////////////////////////////////////////////
// Merge another beagle index (with the standard Index/ and Locks/
// layout under merge_dir) into this one.
public void Merge (string merge_dir)
{
	string merge_index_dir = Path.Combine (merge_dir, "Index");
	string merge_locks_dir = Path.Combine (merge_dir, "Locks");

	if (!Directory.Exists (merge_index_dir) || !Directory.Exists (merge_locks_dir)) {
		throw new Exception ("Index does not exist"); // was: "does not exists"
	}

	// FIXME: Error recovery

	Lucene.Net.Store.FSDirectory store = Lucene.Net.Store.FSDirectory.GetDirectory (merge_index_dir, merge_locks_dir, false);
	Lucene.Net.Store.Directory[] stores = {store};

	IndexWriter writer = new IndexWriter (Store, null, false);
	try {
		writer.AddIndexes (stores);
	} finally {
		// was: a failing AddIndexes left the writer (and its
		// write lock) open
		writer.Close ();
	}
}
1290 /////////////////////////////////////////////////////
1292 // Expose some information for debugging and analytical purposes.
// Dump one "<field> <doc-freq> <term-text>" line per term in the
// index, for debugging and analytical purposes.
public void WriteIndexTermFrequencies (TextWriter writer)
{
	IndexReader reader = IndexReader.Open (Store);
	try {
		TermEnum term_enum = reader.Terms ();
		try {
			Term term;

			while (term_enum.Next ()) {
				term = term_enum.Term ();
				int freq = term_enum.DocFreq ();
				writer.WriteLine ("{0} {1} {2}", term.Field (), freq, term.Text ());
			}
		} finally {
			term_enum.Close (); // was: never closed
		}
	} finally {
		reader.Close (); // was: leaked if enumeration threw
	}
}