2006-09-10 Francisco Javier F. Serrador <serrador@openshine.com>
[beagle.git] / beagled / LuceneCommon.cs
blob9126a9bcfcf39287cfddea9f7b9828ba01703aff
1 //
2 // LuceneCommon.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
27 using System;
28 using System.Collections;
29 using System.Diagnostics;
30 using System.Globalization;
31 using System.IO;
32 using System.Text;
33 using System.Threading;
34 using System.Xml;
35 using System.Xml.Serialization;
37 using Lucene.Net.Analysis;
38 using Lucene.Net.Analysis.Standard;
39 using Lucene.Net.Documents;
40 using Lucene.Net.Index;
41 using Lucene.Net.QueryParsers;
42 using LNS = Lucene.Net.Search;
44 using Beagle.Util;
46 namespace Beagle.Daemon {
48 public class LuceneCommon {
// Callback used to post-filter query hits that Lucene alone can't cull
// (e.g. exact date-range checks).  Return true to keep the hit.
50 public delegate bool HitFilter (Hit hit);
52 // VERSION HISTORY
53 // ---------------
55 // 1: Original
56 // 2: Changed format of timestamp strings
57 // 3: Schema changed to be more Dashboard-Match-like
58 // 4: Schema changed for files to include _Directory property
59 // 5: Changed analyzer to support stemming. Bumped version # to
60 // force everyone to re-index.
61 // 6: lots of schema changes as part of the general refactoring
62 // 7: incremented to force a re-index after our upgrade to lucene 1.4
63 // (in theory the file formats are compatible, we are seeing 'term
64 // out of order' exceptions in some cases)
65 // 8: another forced re-index, this time because of massive changes
66 // in the file system backend (it would be nice to have per-backend
67 // versioning so that we didn't have to purge all indexes just
68 // because one changed)
69 // 9: changed the way properties are stored, changed in conjunction
70 // with sane handling of multiple properties on hits.
71 // 10: changed to support typed and mutable properties
72 // 11: moved mime type and hit type into properties
73 // 12: added year-month and year-month-day resolutions for all
74 // date properties
75 // 13: moved source into a property
76 // 14: allow wildcard queries to also match keywords
77 // 15: analyze PropertyKeyword field, and store all properties as
78 // lower case so that we're truly case insensitive.
// Index format version; bumping MAJOR_VERSION forces a full re-index
// (see VERSION HISTORY above).
79 private const int MAJOR_VERSION = 15;
// Backend-specific minor version; negative means "don't care" on open.
80 private int minor_version = 0;
82 private string index_name;
83 private string top_dir;
// Random per-index identifier written at Create time; lets clients detect
// that an index was rebuilt.
85 private string fingerprint;
// Cached document count; -1 means "unknown, ask the IndexReader".
86 private int last_item_count = -1;
88 // This is the big index, containing document full-texts and
89 // data that is expensive to index.
90 private Lucene.Net.Store.Directory primary_store = null;
92 // This is the small index, containing document info that we
93 // expect to have change.  Canonical example: file names.
94 private Lucene.Net.Store.Directory secondary_store = null;
96 //////////////////////////////////////////////////////////////////////////////
// Set up an index named 'index_name' at the requested minor version.
// A rooted index_name is used verbatim as the on-disk location;
// anything else lives under the daemon's index directory.
protected LuceneCommon (string index_name, int minor_version)
{
	this.index_name = index_name;
	this.minor_version = minor_version;

	if (Path.IsPathRooted (index_name))
		this.top_dir = index_name;
	else
		this.top_dir = Path.Combine (PathFinder.IndexDir, index_name);
}
106 //////////////////////////////////////////////////////////////////////////////
// Read-only accessors for the index's identity, stores and location.
108 protected string IndexName { get { return index_name; } }
110 public Lucene.Net.Store.Directory PrimaryStore { get { return primary_store; } }
112 public Lucene.Net.Store.Directory SecondaryStore { get { return secondary_store; } }
// Null until Create () or Open () has run.
114 public string Fingerprint { get { return fingerprint; } }
116 public string TopDirectory { get { return top_dir; } }
118 //////////////////////////////////////////////////////////////////////////////
// Optional cache of extracted document text, owned by the caller;
// may legitimately stay null.
120 protected TextCache text_cache = null;
122 public TextCache TextCache {
123 get { return text_cache; }
124 set { text_cache = value; }
// On-disk layout under top_dir: "version" and "fingerprint" metadata
// files, one directory per Lucene index, and a shared lock directory.
129 private string VersionFile {
130 get { return Path.Combine (top_dir, "version"); }
133 private string FingerprintFile {
134 get { return Path.Combine (top_dir, "fingerprint"); }
137 // Shouldn't really be public
138 public string PrimaryIndexDirectory {
139 get { return Path.Combine (top_dir, "PrimaryIndex"); }
142 // Shouldn't really be public
143 public string SecondaryIndexDirectory {
144 get { return Path.Combine (top_dir, "SecondaryIndex"); }
147 public string LockDirectory {
148 get { return Path.Combine (top_dir, "Locks"); }
151 //////////////////////////////////////////////////////////////////////////////
153 // Deal with dangling locks
// Decide whether a file in the lock directory is a lock left behind by
// a dead (or foreign) process.  Returns true only for *.lock files
// whose recorded PID no longer maps to a live IndexHelper process.
private bool IsDanglingLock (FileInfo info)
{
	Log.Debug ("Checking for dangling locks...");

	// It isn't even a lock file
	if (! info.Name.EndsWith (".lock"))
		return false;

	string pid = null;

	try {
		// 'using' guarantees the reader is disposed even when
		// ReadLine throws; the old code leaked it on that path.
		using (StreamReader reader = new StreamReader (info.FullName))
			pid = reader.ReadLine ();
	} catch {
		// We couldn't read the lockfile, so it probably went away.
		return false;
	}

	if (pid == null) {
		// Looks like the lock file was empty, which really
		// shouldn't happen.  It should contain the PID of
		// the process which locked it.  Lets be on the safe
		// side and assume it's a dangling lock.
		Log.Warn ("Found an empty lock file, that shouldn't happen: {0}", info.FullName);
		return true;
	}

	// Linux-specific: look up the owning process's command line.
	string cmdline_file;
	cmdline_file = String.Format ("/proc/{0}/cmdline", pid);

	string cmdline = "";
	try {
		using (StreamReader reader = new StreamReader (cmdline_file))
			cmdline = reader.ReadLine ();
	} catch {
		// If we can't open that file, either:
		// (1) The process doesn't exist
		// (2) It does exist, but it doesn't belong to us.
		//     Thus it isn't an IndexHelper
		// In either case, the lock is dangling --- if it
		// still exists.
		return info.Exists;
	}

	// The process exists, but isn't an IndexHelper.
	// If the lock file is still there, it is dangling.
	// FIXME: During one run of bludgeon I got a null reference
	// exception here, so I added the cmdline == null check.
	// Why exactly would that happen?  Is this logic correct
	// in that (odd and presumably rare) case?
	if (cmdline == null || cmdline.IndexOf ("IndexHelper.exe") == -1)
		return info.Exists;

	// If we reach this point, we know:
	// (1) The process still exists
	// (2) We own it
	// (3) It is an IndexHelper process
	// Thus it almost certainly isn't a dangling lock.
	// The process might be wedged, but that is
	// another issue...
	return false;
}
// Returns true if a complete, correctly-versioned, uncorrupted index
// already exists on disk.  Any structural problem — missing files,
// version mismatch, unreadable version file, dangling write lock —
// makes the index count as non-existent.
protected bool Exists ()
{
	if (! (Directory.Exists (top_dir)
	       && File.Exists (VersionFile)
	       && File.Exists (FingerprintFile)
	       && Directory.Exists (PrimaryIndexDirectory)
	       && IndexReader.IndexExists (PrimaryIndexDirectory)
	       && Directory.Exists (SecondaryIndexDirectory)
	       && IndexReader.IndexExists (SecondaryIndexDirectory)
	       && Directory.Exists (LockDirectory)))
		return false;

	// Check the index's version number.  If it is wrong,
	// declare the index non-existent.

	string version_str;
	using (StreamReader version_reader = new StreamReader (VersionFile))
		version_str = version_reader.ReadLine ();

	int current_major_version, current_minor_version;

	try {
		// Format is "major.minor"; a bare number is a
		// pre-versioning minor version.
		int i = version_str.IndexOf ('.');
		if (i != -1) {
			current_major_version = Convert.ToInt32 (version_str.Substring (0, i));
			current_minor_version = Convert.ToInt32 (version_str.Substring (i+1));
		} else {
			current_minor_version = Convert.ToInt32 (version_str);
			current_major_version = 0;
		}
	} catch (Exception) {
		// An empty or garbled version file used to crash us here;
		// treat it as an untrustworthy index instead.
		Logger.Log.Debug ("Unreadable version file in {0}", index_name);
		return false;
	}

	if (current_major_version != MAJOR_VERSION
	    || (minor_version >= 0 && current_minor_version != minor_version)) {
		Logger.Log.Debug ("Version mismatch in {0}", index_name);
		Logger.Log.Debug ("Index has version {0}.{1}, expected {2}.{3}",
				  current_major_version, current_minor_version,
				  MAJOR_VERSION, minor_version);
		return false;
	}

	// Check the lock directory: If there is a dangling write lock,
	// assume that the index is corrupted and declare it non-existent.
	DirectoryInfo lock_dir_info;
	lock_dir_info = new DirectoryInfo (LockDirectory);
	foreach (FileInfo info in lock_dir_info.GetFiles ()) {
		if (IsDanglingLock (info)) {
			Logger.Log.Warn ("Found a dangling index lock on {0}", info.FullName);
			return false;
		}
	}

	return true;
}
// Build a brand-new, empty Lucene index under 'path' and hand back
// the store that wraps it.
private Lucene.Net.Store.Directory CreateIndex (string path)
{
	// Make sure the on-disk directory is there.
	Directory.CreateDirectory (path);

	// Open a fresh store over it (final 'true' == create).
	Lucene.Net.Store.Directory store =
		Lucene.Net.Store.FSDirectory.GetDirectory (path, LockDirectory, true);

	// Write out an empty index so readers have something to open.
	IndexWriter writer = new IndexWriter (store, null, true);
	writer.Close ();

	return store;
}
// Create will kill your index dead.  Use it with care.
// You don't need to call Open after calling Create.
protected void Create ()
{
	if (minor_version < 0)
		minor_version = 0;

	// Purge any existing directories.
	if (Directory.Exists (top_dir)) {
		Logger.Log.Debug ("Purging {0}", top_dir);
		Directory.Delete (top_dir, true);
	}

	// Create any necessary directories.
	Directory.CreateDirectory (top_dir);
	Directory.CreateDirectory (LockDirectory);

	// Create the indexes.
	primary_store = CreateIndex (PrimaryIndexDirectory);
	secondary_store = CreateIndex (SecondaryIndexDirectory);

	// Generate and store the index fingerprint.
	// 'using' flushes and closes the files even if a write throws;
	// the old code leaked the writer on that path.
	fingerprint = GuidFu.ToShortString (Guid.NewGuid ());
	using (TextWriter writer = new StreamWriter (FingerprintFile, false))
		writer.WriteLine (fingerprint);

	// Store our index version information.
	using (TextWriter writer = new StreamWriter (VersionFile, false))
		writer.WriteLine ("{0}.{1}", MAJOR_VERSION, minor_version);
}
// Open the index read-write; see Open (bool) below.
329 protected void Open ()
331 Open (false);
// Open the existing index, optionally read-only: load the stored
// fingerprint and attach FSDirectory stores to both indexes.
protected void Open (bool read_only_mode)
{
	// Read our index fingerprint.  'using' closes the reader even
	// if ReadLine throws; the old code leaked it on that path.
	using (TextReader reader = new StreamReader (FingerprintFile))
		fingerprint = reader.ReadLine ();

	// Create stores for our indexes (false == don't create).
	primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, LockDirectory, false, read_only_mode);
	secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, LockDirectory, false, read_only_mode);
}
347 ////////////////////////////////////////////////////////////////
350 // Custom Analyzers
// A TokenStream that yields exactly one token and then runs dry.
// Used to index whole keyword values without tokenizing them.
private class SingletonTokenStream : TokenStream {

	private string singleton_str;

	public SingletonTokenStream (string singleton_str)
	{
		this.singleton_str = singleton_str;
	}

	// Hand out the single token on the first call; null thereafter.
	override public Lucene.Net.Analysis.Token Next ()
	{
		string s = singleton_str;

		if (s == null)
			return null;

		singleton_str = null; // exhausted from now on

		return new Lucene.Net.Analysis.Token (s, 0, s.Length);
	}
}
// FIXME: This assumes everything being indexed is in English!
private class BeagleAnalyzer : StandardAnalyzer {

	private char [] buffer = new char [2];
	private bool strip_extra_property_info = false;

	public BeagleAnalyzer (bool strip_extra_property_info)
	{
		this.strip_extra_property_info = strip_extra_property_info;
	}

	public override TokenStream TokenStream (string fieldName, TextReader reader)
	{
		bool is_text_prop = false;

		if (fieldName.StartsWith ("prop:")) {

			// Property values carry a type prefix; when
			// indexing, consume everything up to and
			// including the first ':' so it isn't indexed.
			if (strip_extra_property_info) {
				int c = reader.Read ();
				while (c != -1 && c != ':')
					c = reader.Read ();
			}

			is_text_prop = fieldName.StartsWith ("prop:t");

			// Non-text properties become a single lowercased
			// token so keywords aren't split apart.
			if (! is_text_prop)
				return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));

		} else if (fieldName == "PropertyKeyword") {
			return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
		}

		TokenStream outstream = base.TokenStream (fieldName, reader);

		// Full-text-ish fields additionally get noise filtering
		// and Porter stemming.
		if (fieldName == "Text"
		    || fieldName == "HotText"
		    || fieldName == "PropertyText"
		    || is_text_prop) {
			outstream = new NoiseFilter (outstream);
			outstream = new PorterStemFilter (outstream);
		}

		return outstream;
	}
}
429 static private Analyzer indexing_analyzer = new BeagleAnalyzer (true);
430 static private Analyzer query_analyzer = new BeagleAnalyzer (false);
432 static protected Analyzer IndexingAnalyzer { get { return indexing_analyzer; } }
433 static protected Analyzer QueryAnalyzer { get { return query_analyzer; } }
435 ////////////////////////////////////////////////////////////////
438 // Dealing with properties
// Map a property type onto the single-character code embedded in
// its field name ("prop:<code>:<key>").
static private char TypeToCode (PropertyType type)
{
	if (type == PropertyType.Text)
		return 't';
	if (type == PropertyType.Keyword)
		return 'k';
	if (type == PropertyType.Date)
		return 'd';

	throw new Exception ("Bad property type: " + type);
}
// Inverse of TypeToCode: recover a property type from its code char.
static private PropertyType CodeToType (char c)
{
	if (c == 't')
		return PropertyType.Text;
	if (c == 'k')
		return PropertyType.Keyword;
	if (c == 'd')
		return PropertyType.Date;

	throw new Exception ("Bad property code: " + c);
}
// The catch-all field a searched property of this type is also
// indexed under, or null if there is no such field.
static private string TypeToWildcardField (PropertyType type)
{
	if (type == PropertyType.Text)
		return "PropertyText";
	if (type == PropertyType.Keyword)
		return "PropertyKeyword";
	if (type == PropertyType.Date)
		return "PropertyDate";

	return null;
}
// Exposing this is a little bit suspicious.
// Builds the "prop:<code>:<key>" field name for a property.
static protected string PropertyToFieldName (PropertyType type, string key)
{
	return "prop:" + TypeToCode (type) + ":" + key;
}
// Index coarse-grained (year-month and day) renderings of a date
// property so date-range queries can be broken into cheap chunks.
static private void AddDateFields (string field_name, Property prop, Document doc)
{
	DateTime dt = StringFu.StringToDateTime (prop.Value);

	// Both fields: never store, always index, never tokenize.
	doc.Add (new Field ("YM:" + field_name,
			    StringFu.DateTimeToYearMonthString (dt),
			    false, true, false));

	doc.Add (new Field ("D:" + field_name,
			    StringFu.DateTimeToDayString (dt),
			    false, true, false));
}
// Add 'prop' to 'doc': a searched property is indexed both under its
// type's wildcard field and under its own "prop:..." field; date
// properties additionally get YM:/D: resolution fields.  A null prop
// or null Value is a no-op (this is how properties are deleted in
// RewriteDocument).
500 static protected void AddPropertyToDocument (Property prop, Document doc)
502 if (prop == null || prop.Value == null)
503 return;
505 // Don't actually put properties in the UnindexedNamespace
506 // in the document. A horrible (and yet lovely!) hack.
507 if (prop.Key.StartsWith (StringFu.UnindexedNamespace))
508 return;
510 Field f;
512 if (prop.IsSearched) {
513 string wildcard_field = TypeToWildcardField (prop.Type);
514 if (wildcard_field != null) {
515 f = new Field (wildcard_field,
516 prop.Value,
517 false, // never stored
518 true, // always indexed
519 true); // always tokenize (just lowercases for keywords; full analysis for text)
520 doc.Add (f);
522 if (prop.Type == PropertyType.Date)
523 AddDateFields (wildcard_field, prop, doc);
// The per-key field's value is prefixed with 's' or '_' so
// IsSearched survives the round trip (see GetPropertyFromDocument).
527 string coded_value;
528 coded_value = String.Format ("{0}:{1}",
529 prop.IsSearched ? 's' : '_',
530 prop.Value);
532 string field_name = PropertyToFieldName (prop.Type, prop.Key);
534 f = new Field (field_name,
535 coded_value,
536 prop.IsStored,
537 true, // always index
538 true); // always tokenize (strips off type code for keywords and lowercases)
539 doc.Add (f);
541 if (prop.Type == PropertyType.Date)
542 AddDateFields (field_name, prop, doc);
// Reconstruct a Property from a stored Lucene field, or return null
// if the field doesn't encode a property.
static protected Property GetPropertyFromDocument (Field f, Document doc, bool from_primary_index)
{
	// Note: we don't use the document that we pass in, but in
	// theory we could.  At some later point we might need to split
	// a property's data across two or more fields in the document.

	if (f == null)
		return null;

	string field_name = f.Name ();

	// Property fields look like "prop:<code>:<key>"; anything
	// shorter than 7 characters can't carry a key.
	if (field_name.Length < 7 || ! field_name.StartsWith ("prop:"))
		return null;

	string field_value = f.StringValue ();

	Property prop = new Property ();
	prop.Type = CodeToType (field_name [5]);
	prop.Key = field_name.Substring (7);
	prop.Value = field_value.Substring (2);     // strip the "<s|_>:" prefix
	prop.IsSearched = (field_value [0] == 's');
	prop.IsMutable = ! from_primary_index;
	prop.IsStored = f.IsStored ();

	return prop;
}
576 //////////////////////////////////////////////////////////////////////////////
579 // Dealing with documents
// Turn an Indexable into its Lucene representation: a primary
// document (content, timestamps, immutable properties) and, only if
// the indexable carries mutable properties, a secondary document
// keyed by the same Uri.  secondary_doc comes back null otherwise.
582 static protected void BuildDocuments (Indexable indexable,
583 out Document primary_doc,
584 out Document secondary_doc)
586 primary_doc = new Document ();
587 secondary_doc = null;
589 Field f;
591 f = Field.Keyword ("Uri", UriFu.UriToEscapedString (indexable.Uri));
592 primary_doc.Add (f);
594 if (indexable.ParentUri != null) {
595 f = Field.Keyword ("ParentUri", UriFu.UriToEscapedString (indexable.ParentUri));
596 primary_doc.Add (f);
599 if (indexable.ValidTimestamp) {
600 // Note that we also want to search in the
601 // Timestamp field when we do a wildcard date
602 // query, so that's why we also add a wildcard
603 // field for each item here.
605 string wildcard_field = TypeToWildcardField (PropertyType.Date);
// Full resolution, plus the YM:/D: coarse fields used by
// the chunked date-range queries.
607 string str = StringFu.DateTimeToString (indexable.Timestamp);
608 f = Field.Keyword ("Timestamp", str);
609 primary_doc.Add (f);
610 f = Field.UnStored (wildcard_field, str);
611 primary_doc.Add (f);
613 str = StringFu.DateTimeToYearMonthString (indexable.Timestamp);
614 f = Field.Keyword ("YM:Timestamp", str);
615 primary_doc.Add (f);
616 f = Field.UnStored ("YM:" + wildcard_field, str);
617 primary_doc.Add (f);
619 str = StringFu.DateTimeToDayString (indexable.Timestamp);
620 f = Field.Keyword ("D:Timestamp", str);
621 primary_doc.Add (f);
622 f = Field.UnStored ("D:" + wildcard_field, str);
623 primary_doc.Add (f);
626 if (indexable.NoContent) {
627 // If there is no content, make a note of that
628 // in a special property.
629 Property prop;
630 prop = Property.NewBool ("beagle:NoContent", true);
631 AddPropertyToDocument (prop, primary_doc);
633 } else {
635 // Since we might have content, add our text
636 // readers.
638 TextReader reader;
640 reader = indexable.GetTextReader ();
641 if (reader != null) {
642 f = Field.Text ("Text", reader);
643 primary_doc.Add (f);
646 reader = indexable.GetHotTextReader ();
647 if (reader != null) {
648 f = Field.Text ("HotText", reader);
649 primary_doc.Add (f);
653 // Store the Type and MimeType in special properties
655 if (indexable.HitType != null) {
656 Property prop;
657 prop = Property.NewUnsearched ("beagle:HitType", indexable.HitType);
658 AddPropertyToDocument (prop, primary_doc);
661 if (indexable.MimeType != null) {
662 Property prop;
663 prop = Property.NewUnsearched ("beagle:MimeType", indexable.MimeType);
664 AddPropertyToDocument (prop, primary_doc);
667 if (indexable.Source != null) {
668 Property prop;
669 prop = Property.NewUnsearched ("beagle:Source", indexable.Source);
670 AddPropertyToDocument (prop, primary_doc);
673 // Store the other properties
// Mutable properties go into the (lazily-created) secondary
// document so they can be rewritten without re-indexing content.
675 foreach (Property prop in indexable.Properties) {
676 Document target_doc = primary_doc;
677 if (prop.IsMutable) {
678 if (secondary_doc == null) {
679 secondary_doc = new Document ();
680 f = Field.Keyword ("Uri", UriFu.UriToEscapedString (indexable.Uri));
681 secondary_doc.Add (f);
683 target_doc = secondary_doc;
686 AddPropertyToDocument (prop, target_doc);
// Build a replacement secondary document: take the properties from
// 'prop_only_indexable', then carry over any old properties whose
// keys weren't superseded.  Returns the new document (never null).
690 static protected Document RewriteDocument (Document old_secondary_doc,
691 Indexable prop_only_indexable)
693 Hashtable seen_props;
694 seen_props = new Hashtable ();
696 Document new_doc;
697 new_doc = new Document ();
699 Field uri_f;
700 uri_f = Field.Keyword ("Uri", UriFu.UriToEscapedString (prop_only_indexable.Uri));
701 new_doc.Add (uri_f);
703 Logger.Log.Debug ("Rewriting {0}", prop_only_indexable.DisplayUri);
705 // Add the new properties to the new document.  To
706 // delete a property, set the Value to null... then it
707 // will be added to seen_props (so the old value will
708 // be ignored below), but AddPropertyToDocument will
709 // return w/o doing anything.
710 foreach (Property prop in prop_only_indexable.Properties) {
711 seen_props [prop.Key] = prop;
712 AddPropertyToDocument (prop, new_doc);
713 Logger.Log.Debug ("New prop '{0}' = '{1}'", prop.Key, prop.Value);
716 // Copy the other properties from the old document to the
717 // new one, skipping any properties that we got new values
718 // for out of the Indexable.
719 if (old_secondary_doc != null) {
720 foreach (Field f in old_secondary_doc.Fields ()) {
721 Property prop;
722 prop = GetPropertyFromDocument (f, old_secondary_doc, false);
723 if (prop != null && ! seen_props.Contains (prop.Key)) {
724 Logger.Log.Debug ("Old prop '{0}' = '{1}'", prop.Key, prop.Value);
725 AddPropertyToDocument (prop, new_doc);
730 return new_doc;
// Extract the escaped "Uri" field from a document; every document
// we write carries one, so a missing Uri is a hard error.
static protected Uri GetUriFromDocument (Document doc)
{
	string uri = doc.Get ("Uri");

	if (uri == null)
		throw new Exception ("Got document from Lucene w/o a URI!");

	return UriFu.EscapedStringToUri (uri);
}
// Turn a primary-index document back into a Hit, pulling the special
// beagle:* properties out into their dedicated Hit members.
static protected Hit DocumentToHit (Document doc)
{
	Hit hit = new Hit ();

	hit.Uri = GetUriFromDocument (doc);

	string parent = doc.Get ("ParentUri");
	if (parent != null)
		hit.ParentUri = UriFu.EscapedStringToUri (parent);

	hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));

	AddPropertiesToHit (hit, doc, true);

	// Get the Type, MimeType and Source from the properties.
	hit.Type = hit.GetFirstProperty ("beagle:HitType");
	hit.MimeType = hit.GetFirstProperty ("beagle:MimeType");
	hit.Source = hit.GetFirstProperty ("beagle:Source");

	return hit;
}
// Copy every property-bearing field of 'doc' onto 'hit'; fields that
// don't encode properties are silently skipped.
static protected void AddPropertiesToHit (Hit hit, Document doc, bool from_primary_index)
{
	foreach (Field f in doc.Fields ()) {
		Property prop = GetPropertyFromDocument (f, doc, from_primary_index);

		if (prop != null)
			hit.AddProperty (prop);
	}
}
777 //////////////////////////////////////////////////////////////////////////////
780 // Handle the index's item count
// Number of documents in the primary index.  The value is cached in
// last_item_count; a reader is only opened when the cache is cold.
public int GetItemCount ()
{
	if (last_item_count < 0) {
		IndexReader reader = GetReader (PrimaryStore);
		last_item_count = reader.NumDocs ();
		ReleaseReader (reader);
	}

	return last_item_count;
}
794 // We should set the cached count of index items when IndexReaders
795 // are open and available, so calls to GetItemCount will return immediately.
// True when the cached count is valid (see last_item_count).
797 protected bool HaveItemCount { get { return last_item_count >= 0; } }
// Refresh the cache from an already-open reader.
799 protected void SetItemCount (IndexReader reader)
801 last_item_count = reader.NumDocs ();
// Set the cache to a known count directly.
804 public void SetItemCount (int count)
806 last_item_count = count;
// Nudge the cache after adds/deletes; a cold cache stays cold.
809 protected void AdjustItemCount (int delta)
811 if (last_item_count >= 0)
812 last_item_count += delta;
815 //////////////////////////////////////////////////////////////////////////////
818 // Access to the stemmer and list of stop words
// Shared stemmer used by Stem ().
// NOTE(review): this static instance is shared across callers — confirm
// PorterStemmer.Stem is safe for concurrent use before calling Stem ()
// from multiple threads.
821 static PorterStemmer stemmer = new PorterStemmer ();
// Apply Porter stemming to a single word.
823 static public string Stem (string str)
825 return stemmer.Stem (str);
// True if 'stemmed_word' is in Lucene's English stop-word list.
828 public static bool IsStopWord (string stemmed_word)
830 return ArrayFu.IndexOfString (StopAnalyzer.ENGLISH_STOP_WORDS, stemmed_word) != -1;
833 //////////////////////////////////////////////////////////////////////////////
836 // Special Hit Filtering classes
// The always-true filter; used as the default so OrHitFilter can
// short-circuit when it is present (compared by delegate identity).
839 static private bool TrueHitFilter (Hit hit)
841 return true;
844 static private HitFilter true_hit_filter = new HitFilter (TrueHitFilter);
// Combines HitFilters with OR semantics: a hit passes if any member
// filter passes.  Adding the shared always-true filter lets
// HitFilter short-circuit without iterating.
public class OrHitFilter {

	private ArrayList all = new ArrayList ();
	private bool contains_known_true = false;

	public void Add (HitFilter hit_filter)
	{
		if (hit_filter == true_hit_filter)
			contains_known_true = true;

		all.Add (hit_filter);
	}

	public bool HitFilter (Hit hit)
	{
		if (contains_known_true)
			return true;

		foreach (HitFilter filter in all) {
			if (filter (hit))
				return true;
		}

		return false;
	}
}
// Combines HitFilters with AND semantics: a hit passes only when
// every member filter passes.
public class AndHitFilter {

	private ArrayList all = new ArrayList ();

	public void Add (HitFilter hit_filter)
	{
		all.Add (hit_filter);
	}

	public bool HitFilter (Hit hit)
	{
		foreach (HitFilter filter in all) {
			if (! filter (hit))
				return false;
		}

		return true;
	}
}
// Negates another HitFilter.
public class NotHitFilter {

	private HitFilter original;

	public NotHitFilter (HitFilter original)
	{
		this.original = original;
	}

	public bool HitFilter (Hit hit)
	{
		return ! original (hit);
	}
}
901 //////////////////////////////////////////////////////////////////////////////
904 // Queries
// Analyze 'text' with the query analyzer and build a PhraseQuery
// over the resulting tokens in 'field_name'.  Returns null when the
// analyzer produces no tokens (e.g. all stop words).  Each Term is
// also appended to 'term_list' if one is supplied.
907 static private LNS.Query StringToQuery (string field_name,
908 string text,
909 ArrayList term_list)
911 ArrayList tokens = new ArrayList ();
913 // Use the analyzer to extract the query's tokens.
914 // This code is taken from Lucene's query parser.
915 TokenStream source = QueryAnalyzer.TokenStream (field_name, new StringReader (text));
916 while (true) {
917 Lucene.Net.Analysis.Token token;
918 try {
919 token = source.Next ();
920 if (token == null)
921 break;
922 } catch (IOException) {
923 break;
925 if (token != null)
926 tokens.Add (token.TermText ());
928 try {
929 source.Close ();
930 } catch (IOException) {
931 // ignore
934 if (tokens.Count == 0)
935 return null;
937 LNS.PhraseQuery query = new LNS.PhraseQuery ();
939 foreach (string token in tokens) {
940 Term term;
941 term = new Term (field_name, token);
942 query.Add (term);
943 if (term_list != null)
944 term_list.Add (term);
947 return query;
951 // Date Range Handling
954 // This function will break down dates to discrete chunks of
955 // time to avoid expanding RangeQuerys as much as possible.
956 // For example, searching for
958 // YMD(5 May 2005, 16 Oct 2006)
960 // would break down into three queries:
962 // (YM(May 2005) AND D(5,31)) OR
963 // YM(Jun 2005, Sep 2006) OR
964 // (YM(Oct 2006) AND D(1,16))
// Dates are clamped into [lower_bound, upper_bound] before building
// range queries (see GetDateRangeQuery).
966 static private DateTime lower_bound = new DateTime (1970, 1, 1);
968 // FIXME: we should probably boost this sometime around 2030.
969 // Mark your calendar.
970 static private DateTime upper_bound = new DateTime (2038, 12, 31);
// Term for the "YM:" (year-month) coarse date field; value is
// "yyyyMM" so lexicographic order matches chronological order.
972 static private Term NewYearMonthTerm (string field_name, int y, int m)
974 return new Term ("YM:" + field_name, String.Format ("{0}{1:00}", y, m));
// Exact-month query.
977 static private LNS.Query NewYearMonthQuery (string field_name, int y, int m)
979 return new LNS.TermQuery (NewYearMonthTerm (field_name, y, m));
// Inclusive month-range query.
982 static private LNS.Query NewYearMonthQuery (string field_name, int y1, int m1, int y2, int m2)
984 return new LNS.RangeQuery (NewYearMonthTerm (field_name, y1, m1),
985 NewYearMonthTerm (field_name, y2, m2),
986 true); // query is inclusive
// Term for the "D:" (day-of-month) coarse date field, zero-padded.
989 static private Term NewDayTerm (string field_name, int d)
991 return new Term ("D:" + field_name, String.Format ("{0:00}", d));
// Inclusive day-of-month range query.
994 static private LNS.Query NewDayQuery (string field_name, int d1, int d2)
996 return new LNS.RangeQuery (NewDayTerm (field_name, d1),
997 NewDayTerm (field_name, d2),
998 true); // query is inclusive
// Post-filter companion to GetDateRangeQuery: the coarse YM:/D:
// queries can over-match, so this re-checks each hit's actual dates
// against [StartDate, EndDate].
1001 private class DateRangeHitFilter {
1002 public string Key;
1003 public DateTime StartDate;
1004 public DateTime EndDate;
1006 public bool HitFilter (Hit hit)
1008 // First, check the Timestamp
1009 if (Key == QueryPart_DateRange.AllPropertiesKey
1010 || Key == QueryPart_DateRange.TimestampKey) {
1011 DateTime dt;
1012 dt = hit.Timestamp;
1013 if (StartDate <= dt && dt <= EndDate)
1014 return true;
1015 if (Key == QueryPart_DateRange.TimestampKey)
1016 return false;
1019 if (Key == QueryPart_DateRange.AllPropertiesKey) {
1020 // Walk through all of the properties, and see if any
1021 // date properties fall inside the range.
1022 foreach (Property prop in hit.Properties) {
1023 if (prop.Type == PropertyType.Date) {
1024 DateTime dt;
1025 dt = StringFu.StringToDateTime (prop.Value);
1026 if (StartDate <= dt && dt <= EndDate)
1027 return true;
1030 return false;
1031 } else {
1032 // Walk through all of the properties with the given key,
1033 // and see if any of them fall inside of the range.
1034 string[] values;
1035 values = hit.GetProperties (Key);
1036 foreach (string v in values) {
1037 DateTime dt;
1038 dt = StringFu.StringToDateTime (v);
1039 if (StartDate <= dt && dt <= EndDate)
1040 return true;
1042 return false;
// Build a query for a date range, decomposed into at most three
// cheap pieces (partial start month AND its days, whole middle
// months, partial end month AND its days) per the comment block
// above.  Also emits a DateRangeHitFilter via 'hit_filter' to cull
// the over-matches the coarse query lets through.
1047 static private LNS.Query GetDateRangeQuery (QueryPart_DateRange part, out HitFilter hit_filter)
1049 string field_name;
1050 if (part.Key == QueryPart_DateRange.AllPropertiesKey)
1051 field_name = TypeToWildcardField (PropertyType.Date);
1052 else if (part.Key == QueryPart_DateRange.TimestampKey)
1053 field_name = "Timestamp";
1054 else
1055 field_name = PropertyToFieldName (PropertyType.Date, part.Key);
1057 // FIXME: We could optimize this and reduce the size of our range
1058 // queries if we actually knew the min and max date that appear in
1059 // any properties in the index. We would need to inspect the index to
1060 // determine that at start-up, and then track it as new documents
1061 // get added to the index.
1062 if (part.StartDate < lower_bound)
1063 part.StartDate = lower_bound;
1064 if (part.EndDate > upper_bound || part.EndDate == DateTime.MinValue)
1065 part.EndDate = upper_bound;
1067 // Swap the start and end dates if they come in reversed.
1068 if (part.StartDate > part.EndDate) {
1069 DateTime swap;
1070 swap = part.StartDate;
1071 part.StartDate = part.EndDate;
1072 part.EndDate = swap;
1075 // Set up our hit filter to cull out the bad dates.
1076 DateRangeHitFilter drhf;
1077 drhf = new DateRangeHitFilter ();
1078 drhf.Key = part.Key;
1079 drhf.StartDate = part.StartDate;
1080 drhf.EndDate = part.EndDate;
1081 hit_filter = new HitFilter (drhf.HitFilter);
1083 Logger.Log.Debug ("Building new date range query");
1084 Logger.Log.Debug ("Start: {0}", part.StartDate);
1085 Logger.Log.Debug ("End: {0}", part.EndDate);
1087 int y1, m1, d1, y2, m2, d2;
1088 y1 = part.StartDate.Year;
1089 m1 = part.StartDate.Month;
1090 d1 = part.StartDate.Day;
1091 y2 = part.EndDate.Year;
1092 m2 = part.EndDate.Month;
1093 d2 = part.EndDate.Day;
1095 LNS.BooleanQuery top_level_query;
1096 top_level_query = new LNS.BooleanQuery ();
1098 // A special case: both the start and the end of our range fall
1099 // in the same month.
1100 if (y1 == y2 && m1 == m2) {
1101 LNS.Query ym_query;
1102 ym_query = NewYearMonthQuery (field_name, y1, m1);
1104 // If our range only covers a part of the month, do a range query on the days.
1105 if (d1 != 1 || d2 != DateTime.DaysInMonth (y2, m2)) {
1106 LNS.BooleanQuery sub_query;
1107 sub_query = new LNS.BooleanQuery ();
1108 sub_query.Add (ym_query, true, false);
1109 sub_query.Add (NewDayQuery (field_name, d1, d2), true, false);
1110 top_level_query.Add (sub_query, false, false);
1111 } else {
1112 top_level_query.Add (ym_query, false, false);
1115 } else {
1117 // Handle a partial month at the beginning of our range.
1118 if (d1 > 1) {
1119 LNS.BooleanQuery sub_query;
1120 sub_query = new LNS.BooleanQuery ();
1121 sub_query.Add (NewYearMonthQuery (field_name, y1, m1), true, false);
1122 sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), true, false);
1123 top_level_query.Add (sub_query, false, false);
1125 ++m1;
1126 if (m1 == 13) {
1127 m1 = 1;
1128 ++y1;
1132 // And likewise, handle a partial month at the end of our range.
1133 if (d2 < DateTime.DaysInMonth (y2, m2)) {
1134 LNS.BooleanQuery sub_query;
1135 sub_query = new LNS.BooleanQuery ();
1136 sub_query.Add (NewYearMonthQuery (field_name, y2, m2), true, false);
1137 sub_query.Add (NewDayQuery (field_name, 1, d2), true, false);
1138 top_level_query.Add (sub_query, false, false);
1140 --m2;
1141 if (m2 == 0) {
1142 m2 = 12;
1143 --y2;
1147 // Generate the query for the "middle" of our period, if it is non-empty
1148 if (y1 < y2 || ((y1 == y2) && m1 <= m2))
1149 top_level_query.Add (NewYearMonthQuery (field_name, y1, m1, y2, m2),
1150 false, false);
1153 return top_level_query;
// Convert a QueryPart into the Lucene queries needed to evaluate it:
// a query against the primary index, (optionally) a query against the
// secondary property index, and a HitFilter for any post-query
// refinement that Lucene alone can't express (e.g. exact date-range
// culling).  term_list, if non-null, accumulates the Terms used so
// that scores can be computed later.
static protected void QueryPartToQuery (QueryPart abstract_part,
					bool only_build_primary_query,
					ArrayList term_list,
					out LNS.Query primary_query,
					out LNS.Query secondary_query,
					out HitFilter hit_filter)
{
	primary_query = null;
	secondary_query = null;

	// By default, we assume that our lucene queries will return exactly the
	// matching set of objects.  We need to set the hit filter if further
	// refinement of the search results is required.  (As in the case of
	// date range queries, for example.)  We essentially have to do this
	// to make OR queries work correctly.
	hit_filter = true_hit_filter;

	// FIX: bail out on a null part *before* touching its Logic
	// property.  The null test used to come after the dereference
	// below, which would throw a NullReferenceException on a null
	// part instead of returning quietly.
	if (abstract_part == null)
		return;

	// The exception is when dealing with a prohibited part.  Just return
	// null for the hit filter in that case.  This works since
	// prohibited parts are not allowed inside of OR queries.
	if (abstract_part.Logic == QueryPartLogic.Prohibited)
		hit_filter = null;

	if (abstract_part is QueryPart_Text) {
		QueryPart_Text part = (QueryPart_Text) abstract_part;

		if (! (part.SearchFullText || part.SearchTextProperties))
			return;

		LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
		LNS.BooleanQuery s_query = new LNS.BooleanQuery ();

		if (part.SearchFullText) {
			LNS.Query subquery;
			subquery = StringToQuery ("Text", part.Text, term_list);
			if (subquery != null)
				p_query.Add (subquery, false, false);

			// FIXME: HotText is ignored for now!
			// subquery = StringToQuery ("HotText", part.Text);
			// if (subquery != null)
			//	p_query.Add (subquery, false, false);
		}

		if (part.SearchTextProperties) {
			LNS.Query subquery;
			subquery = StringToQuery ("PropertyText", part.Text, term_list);
			if (subquery != null) {
				p_query.Add (subquery, false, false);
				// Properties can live in either index
				if (! only_build_primary_query)
					s_query.Add (subquery.Clone () as LNS.Query, false, false);
			}

			Term term;
			term = new Term ("PropertyKeyword", part.Text.ToLower ()); // make sure text is lowercased
			// FIXME: terms are already added in term_list.  But they may have been tokenized.
			// The term here is the non-tokenized version.  Should this be added to term_list?
			// term_list is used to calculate scores.
			if (term_list != null)
				term_list.Add (term);
			subquery = new LNS.TermQuery (term);
			p_query.Add (subquery, false, false);
			// Properties can live in either index
			if (! only_build_primary_query)
				s_query.Add (subquery.Clone () as LNS.Query, false, false);
		}

		primary_query = p_query;
		if (! only_build_primary_query)
			secondary_query = s_query;

		return;
	}

	if (abstract_part is QueryPart_Wildcard) {
		QueryPart_Wildcard part = (QueryPart_Wildcard) abstract_part;

		LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
		LNS.BooleanQuery s_query = new LNS.BooleanQuery ();

		Term term;
		LNS.Query subquery;

		// Lower case the terms for searching
		string query_string_lower = part.QueryString.ToLower ();

		// Search text content
		term = new Term ("Text", query_string_lower);
		subquery = new LNS.WildcardQuery (term);
		p_query.Add (subquery, false, false);
		term_list.Add (term);

		// Search text properties
		term = new Term ("PropertyText", query_string_lower);
		subquery = new LNS.WildcardQuery (term);
		p_query.Add (subquery, false, false);
		// Properties can live in either index
		if (! only_build_primary_query)
			s_query.Add (subquery.Clone () as LNS.Query, false, false);
		term_list.Add (term);

		// Search property keywords
		term = new Term ("PropertyKeyword", query_string_lower);
		term_list.Add (term);
		subquery = new LNS.WildcardQuery (term);
		p_query.Add (subquery, false, false);
		// Properties can live in either index
		if (! only_build_primary_query)
			s_query.Add (subquery.Clone () as LNS.Query, false, false);

		primary_query = p_query;
		if (! only_build_primary_query)
			secondary_query = s_query;

		return;
	}

	if (abstract_part is QueryPart_Property) {
		QueryPart_Property part = (QueryPart_Property) abstract_part;

		string field_name;
		if (part.Key == QueryPart_Property.AllProperties) {
			field_name = TypeToWildcardField (part.Type);
			// FIXME: probably shouldn't just return silently
			if (field_name == null)
				return;
		} else
			field_name = PropertyToFieldName (part.Type, part.Key);

		if (part.Type == PropertyType.Text)
			primary_query = StringToQuery (field_name, part.Value, term_list);
		else {
			Term term;
			term = new Term (field_name, part.Value.ToLower ());
			if (term_list != null)
				term_list.Add (term);
			primary_query = new LNS.TermQuery (term);
		}

		// Properties can live in either index
		if (! only_build_primary_query && primary_query != null)
			secondary_query = primary_query.Clone () as LNS.Query;

		return;
	}

	if (abstract_part is QueryPart_DateRange) {

		QueryPart_DateRange part = (QueryPart_DateRange) abstract_part;

		primary_query = GetDateRangeQuery (part, out hit_filter);
		// Date properties can live in either index
		if (! only_build_primary_query && primary_query != null)
			secondary_query = primary_query.Clone () as LNS.Query;

		// If this is a prohibited part, invert our hit filter.
		if (part.Logic == QueryPartLogic.Prohibited) {
			NotHitFilter nhf;
			nhf = new NotHitFilter (hit_filter);
			hit_filter = new HitFilter (nhf.HitFilter);
		}

		return;
	}

	if (abstract_part is QueryPart_Or) {
		QueryPart_Or part = (QueryPart_Or) abstract_part;

		// Assemble a new BooleanQuery combining all of the sub-parts.
		LNS.BooleanQuery p_query;
		p_query = new LNS.BooleanQuery ();

		LNS.BooleanQuery s_query = null;
		if (! only_build_primary_query)
			s_query = new LNS.BooleanQuery ();

		primary_query = p_query;
		secondary_query = s_query;

		OrHitFilter or_hit_filter = null;

		foreach (QueryPart sub_part in part.SubParts) {
			LNS.Query p_subq, s_subq;
			HitFilter sub_hit_filter; // FIXME: This is (and must be) ignored
			// FIXME: Any subpart in an OR which has a hit filter won't work
			// correctly, because we can't tell which part of an OR we matched
			// against to filter correctly.  This affects date range queries.
			QueryPartToQuery (sub_part, only_build_primary_query,
					  term_list,
					  out p_subq, out s_subq, out sub_hit_filter);
			if (p_subq != null)
				p_query.Add (p_subq, false, false);
			if (s_subq != null)
				s_query.Add (s_subq, false, false);
			if (sub_hit_filter != null) {
				if (or_hit_filter == null)
					or_hit_filter = new OrHitFilter ();
				or_hit_filter.Add (sub_hit_filter);
			}
		}

		if (or_hit_filter != null)
			hit_filter = new HitFilter (or_hit_filter.HitFilter);

		return;
	}

	throw new Exception ("Unhandled QueryPart type! " + abstract_part.ToString ());
}
// Build a TermQuery matching the given Uri (in escaped string form)
// against field_name.
static protected LNS.Query UriQuery (string field_name, Uri uri)
{
	string escaped_uri = UriFu.UriToEscapedString (uri);
	Term uri_term = new Term (field_name, escaped_uri);
	return new LNS.TermQuery (uri_term);
}
// Convenience overload: build a query matching any Uri in uri_list,
// with no extra required clause.
static protected LNS.Query UriQuery (string field_name, ICollection uri_list)
{
	return UriQuery (field_name, uri_list, null);
}
// Build a query matching any of the Uris in uri_list against
// field_name.  Lucene caps the number of clauses allowed in a single
// BooleanQuery, so when the list is large the Uri terms are spread
// round-robin across several sub-queries hanging off one top-level
// query.  extra_requirement, if non-null, is added as a required
// clause.  Returns null for an empty uri_list.
static protected LNS.Query UriQuery (string field_name, ICollection uri_list, LNS.Query extra_requirement)
{
	if (uri_list.Count == 0)
		return null;

	int max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();

	// Number of buckets needed so that none exceeds the clause limit.
	int n_buckets = 1 + (uri_list.Count - 1) / max_clauses;

	LNS.BooleanQuery top_query = new LNS.BooleanQuery ();

	int slot = 0;
	if (extra_requirement != null) {
		top_query.Add (extra_requirement, true, false);
		++slot;
	}

	ArrayList buckets = null;
	if (n_buckets > 1) {
		buckets = new ArrayList ();
		for (int i = 0; i < n_buckets; ++i) {
			LNS.BooleanQuery bucket = new LNS.BooleanQuery ();
			buckets.Add (bucket);
			top_query.Add (bucket, false, false);
		}
	}

	foreach (Uri uri in uri_list) {
		LNS.Query uri_subquery = UriQuery (field_name, uri);

		// With a single bucket, clauses go straight on the
		// top-level query; otherwise round-robin them.
		LNS.BooleanQuery target;
		if (n_buckets == 1)
			target = top_query;
		else {
			target = (LNS.BooleanQuery) buckets [slot];
			++slot;
			if (slot >= n_buckets)
				slot = 0;
		}

		target.Add (uri_subquery, false, false);
	}

	return top_query;
}
1434 ///////////////////////////////////////////////////////////////////////////////////
// The number of segments in the larger of the two indexes.  Lucene
// stores each segment as a single .cfs compound file, so we just
// count those in each index directory.
public int SegmentCount {
	get {
		int primary_segments = 0;
		foreach (FileInfo fi in new DirectoryInfo (PrimaryIndexDirectory).GetFiles ())
			if (fi.Extension == ".cfs")
				++primary_segments;

		int secondary_segments = 0;
		foreach (FileInfo fi in new DirectoryInfo (SecondaryIndexDirectory).GetFiles ())
			if (fi.Extension == ".cfs")
				++secondary_segments;

		return Math.Max (primary_segments, secondary_segments);
	}
}
1455 ///////////////////////////////////////////////////////////////////////////////////
1457 // Cache IndexReaders on a per-Lucene index basis, since they
1458 // are extremely expensive to create. Note that using this
1459 // only makes sense in situations where the index only
1460 // possibly might change from underneath us, but most of the
1461 // time probably won't. This means it makes sense to do
1462 // this in LuceneQueryingDriver.cs, but it doesn't in
1463 // LuceneIndexingDriver.cs.
// Pairs an IndexReader with the index version it was opened against,
// plus a reference count so that shared cached readers can be closed
// once the last user releases them.
private class ReaderAndVersion {

	public IndexReader Reader;
	public long Version;
	public int Refcount = 1; // creator starts with one reference

	public ReaderAndVersion (IndexReader reader, long version)
	{
		Reader = reader;
		Version = version;
	}
}
// Maps Lucene.Net.Store.Directory -> ReaderAndVersion for the most
// recently opened reader on each index directory.
static private Hashtable directory_rav_map = new Hashtable ();
// Maps IndexReader -> its ReaderAndVersion, so that ReleaseReader can
// find the refcounted entry given only the reader.
static private Hashtable reader_rav_map = new Hashtable ();
// Return a searcher wrapping a cached (possibly shared) reader for
// the given directory.  Pair with ReleaseSearcher.
static public LNS.IndexSearcher GetSearcher (Lucene.Net.Store.Directory directory)
{
	return new LNS.IndexSearcher (GetReader (directory));
}
// Return a cached IndexReader for the given directory, reopening it
// if the on-disk index has changed since the reader was cached.
// Every call takes one reference on the reader; pair each call with
// ReleaseReader.
static public IndexReader GetReader (Lucene.Net.Store.Directory directory)
{
	lock (reader_rav_map) {
		ReaderAndVersion rav = (ReaderAndVersion) directory_rav_map [directory];
		long version = IndexReader.GetCurrentVersion (directory);

		// Cache hit and still current: hand out another reference.
		if (rav != null && version == rav.Version) {
			rav.Refcount++;
			return rav.Reader;
		}

		// Stale entry: drop the cache's own reference to it.  (Any
		// outstanding callers still hold theirs, so the old reader
		// is only closed once they all release.)
		if (rav != null)
			UnrefReaderAndVersion_Unlocked (rav);

		IndexReader reader = IndexReader.Open (directory);

		rav = new ReaderAndVersion (reader, version);
		rav.Refcount++; // one ref held by the cache, one by the caller

		directory_rav_map [directory] = rav;
		reader_rav_map [reader] = rav;

		return rav.Reader;
	}
}
// Drop one reference; when the count reaches zero, close the reader
// and forget it.  Caller must hold the reader_rav_map lock.
static private void UnrefReaderAndVersion_Unlocked (ReaderAndVersion rav)
{
	if (--rav.Refcount == 0) {
		rav.Reader.Close ();
		reader_rav_map.Remove (rav.Reader);
	}
}
// Release one reference on a reader obtained from GetReader.
static public void ReleaseReader (IndexReader reader)
{
	lock (reader_rav_map)
		UnrefReaderAndVersion_Unlocked ((ReaderAndVersion) reader_rav_map [reader]);
}
// Close a searcher obtained from GetSearcher and release the
// underlying cached reader.
static public void ReleaseSearcher (LNS.IndexSearcher searcher)
{
	// Grab the reader before closing the searcher.
	IndexReader held_reader = searcher.GetIndexReader ();

	searcher.Close ();
	ReleaseReader (held_reader);
}
1556 ///////////////////////////////////////////////////////////////////////////////////
1559 // Various ways to grab lots of hits at once.
1560 // These should never be used for querying, only for utility
1561 // functions.
// Fill block_of_hits with hits pulled from the index, starting at the
// document id given by cookie (a negative cookie means "start at a
// random position").  Returns the cookie to pass in on the next call.
// Deleted documents are skipped, so fewer hits than requested may be
// returned; the tail of the array is null-padded.  This is a utility
// routine, never used for querying.
public int GetBlockOfHits (int cookie,
			   Hit [] block_of_hits)
{
	IndexReader primary_reader;
	IndexReader secondary_reader;
	primary_reader = GetReader (PrimaryStore);
	secondary_reader = GetReader (SecondaryStore);

	// Never try to fetch more documents than the index holds.
	int request_size;
	request_size = block_of_hits.Length;
	if (request_size > primary_reader.NumDocs ())
		request_size = primary_reader.NumDocs ();

	int max_doc;
	max_doc = primary_reader.MaxDoc ();

	if (cookie < 0) {
		Random random;
		random = new Random ();
		cookie = random.Next (max_doc);
	}

	// Remember where we started so we can detect a full wrap-around.
	int original_cookie;
	original_cookie = cookie;

	Hashtable primary_docs, secondary_docs;
	primary_docs = UriFu.NewHashtable ();
	secondary_docs = UriFu.NewHashtable ();

	// Load the primary documents
	for (int i = 0; i < request_size; ++i) {

		if (! primary_reader.IsDeleted (cookie)) {
			Document doc;
			doc = primary_reader.Document (cookie);
			primary_docs [GetUriFromDocument (doc)] = doc;
		}

		++cookie;
		if (cookie >= max_doc) // wrap around
			cookie = 0;

		// If we somehow end up back where we started,
		// give up.
		if (cookie == original_cookie)
			break;
	}

	// If necessary, load the secondary documents
	if (secondary_reader != null) {
		LNS.IndexSearcher searcher;
		searcher = new LNS.IndexSearcher (secondary_reader);

		// One query over all the collected primary Uris pulls in
		// the matching property-only documents.
		LNS.Query uri_query;
		uri_query = UriQuery ("Uri", primary_docs.Keys);

		LNS.Hits hits;
		hits = searcher.Search (uri_query);
		for (int i = 0; i < hits.Length (); ++i) {
			Document doc;
			doc = hits.Doc (i);
			secondary_docs [GetUriFromDocument (doc)] = doc;
		}

		searcher.Close ();
	}

	ReleaseReader (primary_reader);
	ReleaseReader (secondary_reader);

	// Now assemble the hits
	int j = 0;
	foreach (Uri uri in primary_docs.Keys) {
		Document primary_doc, secondary_doc;
		primary_doc = primary_docs [uri] as Document;
		secondary_doc = secondary_docs [uri] as Document;

		Hit hit;
		hit = DocumentToHit (primary_doc);
		// Fold in any extra properties from the secondary index.
		if (secondary_doc != null)
			AddPropertiesToHit (hit, secondary_doc, false);

		block_of_hits [j] = hit;
		++j;
	}

	// null-pad the array, if necessary
	for (; j < block_of_hits.Length; ++j)
		block_of_hits [j] = null;

	// Return the new cookie
	return cookie;
}
1659 // For a large index, this will be very slow and will consume
1660 // a lot of memory. Don't call it without a good reason!
1661 // We return a hashtable indexed by Uri.
// Build a Hashtable mapping Uri -> Hit for every live document in
// the index.  For a large index this is very slow and consumes a lot
// of memory — don't call it without a good reason!
public Hashtable GetAllHitsByUri ()
{
	Hashtable all_hits = UriFu.NewHashtable ();

	IndexReader primary_reader = GetReader (PrimaryStore);
	IndexReader secondary_reader = GetReader (SecondaryStore);

	// Walk every non-deleted document in the primary index and
	// turn it into a Hit.
	int primary_max = primary_reader.MaxDoc ();
	for (int doc_id = 0; doc_id < primary_max; ++doc_id) {

		if (primary_reader.IsDeleted (doc_id))
			continue;

		Hit hit = DocumentToHit (primary_reader.Document (doc_id));
		all_hits [hit.Uri] = hit;
	}

	// Now fold in any extra properties stored in the secondary
	// index, if it exists.
	if (secondary_reader != null) {
		int secondary_max = secondary_reader.MaxDoc ();
		for (int doc_id = 0; doc_id < secondary_max; ++doc_id) {

			if (secondary_reader.IsDeleted (doc_id))
				continue;

			Document doc = secondary_reader.Document (doc_id);

			Hit hit = (Hit) all_hits [GetUriFromDocument (doc)];
			if (hit != null)
				AddPropertiesToHit (hit, doc, false);
		}
	}

	ReleaseReader (primary_reader);
	ReleaseReader (secondary_reader);

	return all_hits;
}