4 // Copyright (C) 2004-2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System
.Collections
;
29 using System
.Diagnostics
;
30 using System
.Globalization
;
33 using System
.Threading
;
35 using System
.Xml
.Serialization
;
37 using Lucene
.Net
.Analysis
;
38 using Lucene
.Net
.Analysis
.Standard
;
39 using Lucene
.Net
.Documents
;
40 using Lucene
.Net
.Index
;
41 using Lucene
.Net
.QueryParsers
;
42 using LNS
= Lucene
.Net
.Search
;
46 namespace Beagle
.Daemon
{
48 public class LuceneCommon
{
50 public delegate bool HitFilter (Hit hit
);
56 // 2: Changed format of timestamp strings
57 // 3: Schema changed to be more Dashboard-Match-like
58 // 4: Schema changed for files to include _Directory property
59 // 5: Changed analyzer to support stemming. Bumped version # to
60 // force everyone to re-index.
61 // 6: lots of schema changes as part of the general refactoring
62 // 7: incremented to force a re-index after our upgrade to lucene 1.4
63 // (in theory the file formats are compatible, we are seeing 'term
64 // out of order' exceptions in some cases)
65 // 8: another forced re-index, this time because of massive changes
66 // in the file system backend (it would be nice to have per-backend
67 // versioning so that we didn't have to purge all indexes just
68 // because one changed)
69 // 9: changed the way properties are stored, changed in conjunction
70 // with sane handling of multiple properties on hits.
71 // 10: changed to support typed and mutable properties
72 // 11: moved mime type and hit type into properties
73 // 12: added year-month and year-month-day resolutions for all
75 // 13: moved source into a property
76 // 14: allow wildcard queries to also match keywords
77 // 15: analyze PropertyKeyword field, and store all properties as
78 // lower case so that we're truly case insensitive.
79 // 16: add inverted timestamp to make querying substantially faster
80 // 17: add boolean property to denote a child indexable
// On-disk index format version. Bumping MAJOR_VERSION forces every
// index to be rebuilt (see the change history above).
81 private const int MAJOR_VERSION
= 17;
// Backend-specific minor version, supplied by the subclass via the
// constructor; a negative value means "don't check the minor version".
82 private int minor_version
= 0;
// Name of this index; also used to derive top_dir when the name is
// not already an absolute path (see the constructor).
84 private string index_name
;
// Directory holding all of this index's files.
85 private string top_dir
;
// Random identifier for this particular index instance, read from or
// written to the "fingerprint" file by Open/Create.
87 private string fingerprint
;
// Cached document count; -1 means "not yet known" (see GetItemCount).
88 private int last_item_count
= -1;
90 // This is the big index, containing document full-texts and
91 // data that is expensive to index.
92 private Lucene
.Net
.Store
.Directory primary_store
= null;
94 // This is the small index, containing document info that we
95 // expect to have change. Canonical example: file names.
96 private Lucene
.Net
.Store
.Directory secondary_store
= null;
98 //////////////////////////////////////////////////////////////////////////////
// Records the index name and requested minor version, and computes the
// directory that holds this index: an absolute index_name is used
// as-is, otherwise the index lives under PathFinder.IndexDir.
protected LuceneCommon (string index_name, int minor_version)
{
	this.index_name = index_name;
	this.minor_version = minor_version;

	if (Path.IsPathRooted (index_name))
		this.top_dir = index_name;
	else
		this.top_dir = Path.Combine (PathFinder.IndexDir, index_name);
}
108 //////////////////////////////////////////////////////////////////////////////
// The name this index was created with.
110 protected string IndexName { get { return index_name; }
}
// Store for the primary index (full texts and expensive-to-index data).
112 public Lucene
.Net
.Store
.Directory PrimaryStore { get { return primary_store; }
}
// Store for the secondary index (data expected to change, e.g. file names).
114 public Lucene
.Net
.Store
.Directory SecondaryStore { get { return secondary_store; }
}
// Fingerprint identifying this index instance; set by Open/Create.
116 public string Fingerprint { get { return fingerprint; }
}
// Root directory containing all files for this index.
118 public string TopDirectory { get { return top_dir; }
}
120 //////////////////////////////////////////////////////////////////////////////
// Optional cache of extracted document text; may be null.
122 protected TextCache text_cache
= null;
// Gets/sets the text cache used alongside this index.
124 public TextCache TextCache
{
125 get { return text_cache; }
126 set { text_cache = value; }
129 //////////////////////////////////////////////////////////////////////////////
// File recording the index format version as "MAJOR.minor"
// (see Create/Exists).
131 private string VersionFile
{
132 get { return Path.Combine (top_dir, "version"); }
// File recording this index instance's fingerprint.
135 private string FingerprintFile
{
136 get { return Path.Combine (top_dir, "fingerprint"); }
139 // Shouldn't really be public
140 public string PrimaryIndexDirectory
{
141 get { return Path.Combine (top_dir, "PrimaryIndex"); }
144 // Shouldn't really be public
145 public string SecondaryIndexDirectory
{
146 get { return Path.Combine (top_dir, "SecondaryIndex"); }
// Directory where Lucene keeps its lock files for both stores.
149 public string LockDirectory
{
150 get { return Path.Combine (top_dir, "Locks"); }
153 //////////////////////////////////////////////////////////////////////////////
155 // Deal with dangling locks
// Heuristically decides whether a Lucene lock file was left behind by
// a dead process: reads the PID stored in the lock file, then checks
// /proc/<pid>/cmdline to see whether that PID is a live IndexHelper.
157 private bool IsDanglingLock (FileInfo info
)
159 Log
.Debug ("Checking for dangling locks...");
161 // It isn't even a lock file
162 if (! info
.Name
.EndsWith (".lock"))
// Read the PID recorded in the lock file.
169 reader
= new StreamReader (info
.FullName
);
170 pid
= reader
.ReadLine ();
174 // We couldn't read the lockfile, so it probably went away.
180 // Looks like the lock file was empty, which really
181 // shouldn't happen. It should contain the PID of
182 // the process which locked it. Let's be on the safe
183 // side and assume it's a dangling lock.
184 Log
.Warn ("Found an empty lock file, that shouldn't happen: {0}", info
.FullName
);
// Check what command line the owning process is running (Linux-only:
// relies on the /proc filesystem).
189 cmdline_file
= String
.Format ("/proc/{0}/cmdline", pid
);
193 reader
= new StreamReader (cmdline_file
);
194 cmdline
= reader
.ReadLine ();
197 // If we can't open that file, either:
198 // (1) The process doesn't exist
199 // (2) It does exist, but it doesn't belong to us.
200 // Thus it isn't an IndexHelper
201 // In either case, the lock is dangling --- if it
206 // The process exists, but isn't an IndexHelper.
207 // If the lock file is still there, it is dangling.
208 // FIXME: During one run of bludgeon I got a null reference
209 // exception here, so I added the cmdline == null check.
210 // Why exactly would that happen? Is this logic correct
211 // in that (odd and presumably rare) case?
212 if (cmdline
== null || cmdline
.IndexOf ("IndexHelper.exe") == -1)
215 // If we reach this point, we know:
216 // (1) The process still exists
218 // (3) It is an IndexHelper process
219 // Thus it almost certainly isn't a dangling lock.
220 // The process might be wedged, but that is
225 protected bool Exists ()
227 if (! (Directory
.Exists (top_dir
)
228 && File
.Exists (VersionFile
)
229 && File
.Exists (FingerprintFile
)
230 && Directory
.Exists (PrimaryIndexDirectory
)
231 && IndexReader
.IndexExists (PrimaryIndexDirectory
)
232 && Directory
.Exists (SecondaryIndexDirectory
)
233 && IndexReader
.IndexExists (SecondaryIndexDirectory
)
234 && Directory
.Exists (LockDirectory
)))
237 // Check the index's version number. If it is wrong,
238 // declare the index non-existent.
240 StreamReader version_reader
;
242 version_reader
= new StreamReader (VersionFile
);
243 version_str
= version_reader
.ReadLine ();
244 version_reader
.Close ();
246 int current_major_version
, current_minor_version
;
247 int i
= version_str
.IndexOf ('.');
250 current_major_version
= Convert
.ToInt32 (version_str
.Substring (0, i
));
251 current_minor_version
= Convert
.ToInt32 (version_str
.Substring (i
+1));
253 current_minor_version
= Convert
.ToInt32 (version_str
);
254 current_major_version
= 0;
257 if (current_major_version
!= MAJOR_VERSION
258 || (minor_version
>= 0 && current_minor_version
!= minor_version
)) {
259 Logger
.Log
.Debug ("Version mismatch in {0}", index_name
);
260 Logger
.Log
.Debug ("Index has version {0}.{1}, expected {2}.{3}",
261 current_major_version
, current_minor_version
,
262 MAJOR_VERSION
, minor_version
);
266 // Check the lock directory: If there is a dangling write lock,
267 // assume that the index is corrupted and declare it non-existent.
268 DirectoryInfo lock_dir_info
;
269 lock_dir_info
= new DirectoryInfo (LockDirectory
);
270 foreach (FileInfo info
in lock_dir_info
.GetFiles ()) {
271 if (IsDanglingLock (info
)) {
272 Logger
.Log
.Warn ("Found a dangling index lock on {0}", info
.FullName
);
280 private Lucene
.Net
.Store
.Directory
CreateIndex (string path
)
282 // Create a directory to put the index in.
283 Directory
.CreateDirectory (path
);
285 // Create a new store.
286 Lucene
.Net
.Store
.Directory store
;
287 store
= Lucene
.Net
.Store
.FSDirectory
.GetDirectory (path
, LockDirectory
, true);
289 // Create an empty index in that store.
291 writer
= new IndexWriter (store
, null, true);
297 // Create will kill your index dead. Use it with care.
298 // You don't need to call Open after calling Create.
299 protected void Create ()
301 if (minor_version
< 0)
304 // Purge any existing directories.
305 if (Directory
.Exists (top_dir
)) {
306 Logger
.Log
.Debug ("Purging {0}", top_dir
);
307 Directory
.Delete (top_dir
, true);
310 // Create any necessary directories.
311 Directory
.CreateDirectory (top_dir
);
312 Directory
.CreateDirectory (LockDirectory
);
314 // Create the indexes.
315 primary_store
= CreateIndex (PrimaryIndexDirectory
);
316 secondary_store
= CreateIndex (SecondaryIndexDirectory
);
318 // Generate and store the index fingerprint.
319 fingerprint
= GuidFu
.ToShortString (Guid
.NewGuid ());
321 writer
= new StreamWriter (FingerprintFile
, false);
322 writer
.WriteLine (fingerprint
);
325 // Store our index version information.
326 writer
= new StreamWriter (VersionFile
, false);
327 writer
.WriteLine ("{0}.{1}", MAJOR_VERSION
, minor_version
);
331 protected void Open ()
336 protected void Open (bool read_only_mode
)
338 // Read our index fingerprint.
340 reader
= new StreamReader (FingerprintFile
);
341 fingerprint
= reader
.ReadLine ();
344 // Create stores for our indexes.
345 primary_store
= Lucene
.Net
.Store
.FSDirectory
.GetDirectory (PrimaryIndexDirectory
, LockDirectory
, false, read_only_mode
);
346 secondary_store
= Lucene
.Net
.Store
.FSDirectory
.GetDirectory (SecondaryIndexDirectory
, LockDirectory
, false, read_only_mode
);
349 ////////////////////////////////////////////////////////////////
355 private class SingletonTokenStream
: TokenStream
{
357 private string singleton_str
;
359 public SingletonTokenStream (string singleton_str
)
361 this.singleton_str
= singleton_str
;
364 override public Lucene
.Net
.Analysis
.Token
Next ()
366 if (singleton_str
== null)
369 Lucene
.Net
.Analysis
.Token token
;
370 token
= new Lucene
.Net
.Analysis
.Token (singleton_str
, 0, singleton_str
.Length
);
372 singleton_str
= null;
378 // FIXME: This assumes everything being indexed is in English!
379 internal class BeagleAnalyzer
: StandardAnalyzer
{
381 private char [] buffer
= new char [2];
382 private bool strip_extra_property_info
= false;
383 private bool tokenize_email_hostname
= false;
385 public BeagleAnalyzer (bool is_indexing_analyzer
)
387 if (is_indexing_analyzer
) {
388 this.strip_extra_property_info
= true;
389 this.tokenize_email_hostname
= true;
391 this.strip_extra_property_info
= false;
392 this.tokenize_email_hostname
= false;
396 public override TokenStream
TokenStream (string fieldName
, TextReader reader
)
398 bool is_text_prop
= false;
400 // Strip off the first two characters in a property.
401 // We store type information in those two characters, so we don't
402 // want to index them.
403 if (fieldName
.StartsWith ("prop:")) {
405 if (strip_extra_property_info
) {
406 // Skip everything up to and including the first :
410 } while (c
!= -1 && c
!= ':');
413 is_text_prop
= fieldName
.StartsWith ("prop:t");
415 // If this is non-text property, just return one token
416 // containing the entire string. We do this to avoid
417 // tokenizing keywords.
418 if (! is_text_prop
) {
419 // We don't want to lower case the token if it's
420 // not in the private namespace.
422 TokenStream singleton_stream
= new SingletonTokenStream (reader
.ReadToEnd ());
424 if (fieldName
.StartsWith ("prop:k:" + Property
.PrivateNamespace
))
425 return singleton_stream
;
427 return new LowerCaseFilter (singleton_stream
);
429 } else if (fieldName
== "PropertyKeyword")
430 return new LowerCaseFilter (new SingletonTokenStream (reader
.ReadToEnd ()));
432 TokenStream outstream
;
433 outstream
= base.TokenStream (fieldName
, reader
);
435 if (fieldName
== "Text"
436 || fieldName
== "HotText"
437 || fieldName
== "PropertyText"
439 outstream
= new NoiseEmailHostFilter (outstream
, tokenize_email_hostname
);
440 outstream
= new PorterStemFilter (outstream
);
// Analyzer used when adding documents: strips property-type prefixes
// and tokenizes e-mail hostnames (see the BeagleAnalyzer constructor).
447 static private Analyzer indexing_analyzer
= new BeagleAnalyzer (true);
// Analyzer used when parsing queries (no stripping/tokenizing of the above).
448 static private Analyzer query_analyzer
= new BeagleAnalyzer (false);
450 static protected Analyzer IndexingAnalyzer { get { return indexing_analyzer; }
}
451 static protected Analyzer QueryAnalyzer { get { return query_analyzer; }
}
453 ////////////////////////////////////////////////////////////////
456 // Dealing with properties
// Maps a PropertyType to the one-character code embedded in field
// names and stored values: Text -> 't', Keyword -> 'k', Date -> 'd'.
// Inverse of CodeToType.
459 static private char TypeToCode (PropertyType type
)
462 case PropertyType
.Text
: return 't';
463 case PropertyType
.Keyword
: return 'k';
464 case PropertyType
.Date
: return 'd';
// Any other property type is a programming error.
466 throw new Exception ("Bad property type: " + type
)
// Maps a one-character type code back to its PropertyType.
// Inverse of TypeToCode.
469 static private PropertyType
CodeToType (char c
)
472 case 't': return PropertyType
.Text
;
473 case 'k': return PropertyType
.Keyword
;
474 case 'd': return PropertyType
.Date
;
// Any other code is a programming error or index corruption.
477 throw new Exception ("Bad property code: " + c
);
// Maps a PropertyType to the aggregate field used for wildcard
// (any-property) searches of that type.
480 static private string TypeToWildcardField (PropertyType type
)
483 case PropertyType
.Text
: return "PropertyText";
484 case PropertyType
.Keyword
: return "PropertyKeyword";
485 case PropertyType
.Date
: return "PropertyDate";
488 throw new Exception ("Bad property type: " + type
);
// Maps a PropertyType to the Lucene indexing mode used when storing
// a property of that type.
491 static private Field
.Index
TypeToIndexInstruction (PropertyType type
)
494 case PropertyType
.Text
: return Field
.Index
.TOKENIZED
; // Full analysis
495 case PropertyType
.Keyword
: return Field
.Index
.TOKENIZED
; // Lowercases keywords
496 case PropertyType
.Date
: return Field
.Index
.NO_NORMS
; // Do nothing
499 throw new Exception ("Bad property type: " + type
);
// Exposing this is a little bit suspicious.
// Builds the Lucene field name under which a property is stored:
// "prop:" + one-character type code (see TypeToCode) + ":" + key.
static protected string PropertyToFieldName (PropertyType type, string key)
{
	return "prop:" + TypeToCode (type) + ":" + key;
}
509 static private void AddDateFields (string field_name
, Property prop
, Document doc
)
511 DateTime dt
= StringFu
.StringToDateTime (prop
.Value
);
514 f
= new Field ("YM:" + field_name
,
515 StringFu
.DateTimeToYearMonthString (dt
),
517 Field
.Index
.NO_NORMS
);
520 f
= new Field ("D:" + field_name
,
521 StringFu
.DateTimeToDayString (dt
),
523 Field
.Index
.NO_NORMS
);
527 static protected void AddPropertyToDocument (Property prop
, Document doc
)
529 if (prop
== null || prop
.Value
== null || prop
.Value
== String
.Empty
)
532 // Don't actually put properties in the UnindexedNamespace
533 // in the document. A horrible (and yet lovely!) hack.
534 if (prop
.Key
.StartsWith (StringFu
.UnindexedNamespace
))
539 if (prop
.IsSearched
) {
540 string wildcard_field
= TypeToWildcardField (prop
.Type
);
542 f
= new Field (wildcard_field
,
545 TypeToIndexInstruction (prop
.Type
));
547 // We don't want to include norms for non-text
548 // fields, even if we do tokenize them.
549 if (prop
.Type
== PropertyType
.Keyword
|| prop
.Type
== PropertyType
.Date
)
550 f
.SetOmitNorms (true);
554 if (prop
.Type
== PropertyType
.Date
)
555 AddDateFields (wildcard_field
, prop
, doc
);
559 coded_value
= String
.Format ("{0}:{1}",
560 prop
.IsSearched
? 's' : '_',
563 string field_name
= PropertyToFieldName (prop
.Type
, prop
.Key
);
565 f
= new Field (field_name
,
567 prop
.IsStored
? Field
.Store
.YES
: Field
.Store
.NO
,
568 Field
.Index
.TOKENIZED
);
571 if (prop
.Type
== PropertyType
.Date
)
572 AddDateFields (field_name
, prop
, doc
);
575 static protected Property
GetPropertyFromDocument (Field f
, Document doc
, bool from_primary_index
)
577 // Note: we don't use the document that we pass in,
578 // but in theory we could. At some later point we
579 // might need to split a property's data across two or
580 // more fields in the document.
586 field_name
= f
.Name ();
587 if (field_name
.Length
< 7
588 || ! field_name
.StartsWith ("prop:"))
592 field_value
= f
.StringValue ();
595 prop
= new Property ();
596 prop
.Type
= CodeToType (field_name
[5]);
597 prop
.Key
= field_name
.Substring (7);
598 prop
.Value
= field_value
.Substring (2);
599 prop
.IsSearched
= (field_value
[0] == 's');
600 prop
.IsMutable
= ! from_primary_index
;
601 prop
.IsStored
= f
.IsStored ();
606 //////////////////////////////////////////////////////////////////////////////
609 // Dealing with documents
612 static protected void BuildDocuments (Indexable indexable
,
613 out Document primary_doc
,
614 out Document secondary_doc
)
616 primary_doc
= new Document ();
617 secondary_doc
= null;
621 f
= new Field ("Uri", UriFu
.UriToEscapedString (indexable
.Uri
),
622 Field
.Store
.YES
, Field
.Index
.NO_NORMS
);
625 if (indexable
.ParentUri
!= null) {
626 f
= new Field ("ParentUri", UriFu
.UriToEscapedString (indexable
.ParentUri
),
627 Field
.Store
.YES
, Field
.Index
.NO_NORMS
);
631 if (indexable
.ValidTimestamp
) {
632 // Note that we also want to search in the
633 // Timestamp field when we do a wildcard date
634 // query, so that's why we also add a wildcard
635 // field for each item here.
637 string wildcard_field
= TypeToWildcardField (PropertyType
.Date
);
639 string str
= StringFu
.DateTimeToString (indexable
.Timestamp
);
640 f
= new Field ("Timestamp", str
, Field
.Store
.YES
, Field
.Index
.NO_NORMS
);
642 f
= new Field (wildcard_field
, str
, Field
.Store
.NO
, Field
.Index
.NO_NORMS
);
645 // Create an inverted timestamp so that we can
646 // sort by timestamp at search-time.
647 long timeval
= Convert
.ToInt64 (str
);
648 f
= new Field ("InvertedTimestamp", (Int64
.MaxValue
- timeval
).ToString (),
649 Field
.Store
.NO
, Field
.Index
.NO_NORMS
);
652 str
= StringFu
.DateTimeToYearMonthString (indexable
.Timestamp
);
653 f
= new Field ("YM:Timestamp", str
, Field
.Store
.YES
, Field
.Index
.NO_NORMS
);
655 f
= new Field ("YM:" + wildcard_field
, str
,
656 Field
.Store
.NO
, Field
.Index
.NO_NORMS
);
659 str
= StringFu
.DateTimeToDayString (indexable
.Timestamp
);
660 f
= new Field ("D:Timestamp", str
, Field
.Store
.YES
, Field
.Index
.NO_NORMS
);
662 f
= new Field ("D:" + wildcard_field
, str
,
663 Field
.Store
.NO
, Field
.Index
.NO_NORMS
);
667 if (indexable
.NoContent
) {
668 // If there is no content, make a note of that
669 // in a special property.
671 prop
= Property
.NewBool ("beagle:NoContent", true);
672 AddPropertyToDocument (prop
, primary_doc
);
676 // Since we might have content, add our text
681 reader
= indexable
.GetTextReader ();
682 if (reader
!= null) {
683 f
= new Field ("Text", reader
);
687 reader
= indexable
.GetHotTextReader ();
688 if (reader
!= null) {
689 f
= new Field ("HotText", reader
);
694 // Store the Type and MimeType in special properties
696 if (indexable
.HitType
!= null) {
698 prop
= Property
.NewUnsearched ("beagle:HitType", indexable
.HitType
);
699 AddPropertyToDocument (prop
, primary_doc
);
702 if (indexable
.MimeType
!= null) {
704 prop
= Property
.NewUnsearched ("beagle:MimeType", indexable
.MimeType
);
705 AddPropertyToDocument (prop
, primary_doc
);
708 if (indexable
.Source
!= null) {
710 prop
= Property
.NewUnsearched ("beagle:Source", indexable
.Source
);
711 AddPropertyToDocument (prop
, primary_doc
);
716 prop
= Property
.NewBool (Property
.IsChildPropKey
, indexable
.IsChild
);
717 AddPropertyToDocument (prop
, primary_doc
);
720 // Store the other properties
722 foreach (Property prop
in indexable
.Properties
) {
723 Document target_doc
= primary_doc
;
724 if (prop
.IsMutable
) {
725 if (secondary_doc
== null)
726 secondary_doc
= CreateSecondaryDocument (indexable
.Uri
, indexable
.ParentUri
);
728 target_doc
= secondary_doc
;
731 AddPropertyToDocument (prop
, target_doc
);
735 static private Document
CreateSecondaryDocument (Uri uri
, Uri parent_uri
)
737 Document secondary_doc
= new Document ();
739 Field f
= new Field ("Uri", UriFu
.UriToEscapedString (uri
), Field
.Store
.YES
, Field
.Index
.NO_NORMS
);
740 secondary_doc
.Add (f
);
742 if (parent_uri
!= null) {
743 // Store both Uri and ParentUri in secondary index for easy removal
744 f
= new Field ("ParentUri", UriFu
.UriToEscapedString (parent_uri
), Field
.Store
.YES
, Field
.Index
.NO_NORMS
);
745 secondary_doc
.Add (f
);
748 return secondary_doc
;
751 static protected Document
RewriteDocument (Document old_secondary_doc
,
752 Indexable prop_only_indexable
)
754 Hashtable seen_props
;
755 seen_props
= new Hashtable ();
758 new_doc
= new Document ();
761 uri_f
= new Field ("Uri", UriFu
.UriToEscapedString (prop_only_indexable
.Uri
), Field
.Store
.YES
, Field
.Index
.NO_NORMS
);
764 Logger
.Log
.Debug ("Rewriting {0}", prop_only_indexable
.DisplayUri
);
766 if (prop_only_indexable
.ParentUri
!= null) {
767 uri_f
= new Field ("ParentUri", UriFu
.UriToEscapedString (prop_only_indexable
.ParentUri
), Field
.Store
.YES
, Field
.Index
.NO_NORMS
);
769 Logger
.Log
.Debug ("Parent Uri {0}", prop_only_indexable
.ParentUri
);
772 // Add the new properties to the new document. To
773 // delete a property, set the Value to null... then it
774 // will be added to seen_props (so the old value will
775 // be ignored below), but AddPropertyToDocument will
776 // return w/o doing anything.
777 foreach (Property prop
in prop_only_indexable
.Properties
) {
778 seen_props
[prop
.Key
] = prop
;
780 // Don't add properties that are empty; they
781 // essentially mean "reset this property"
782 if (prop
.Value
== String
.Empty
) {
783 Logger
.Log
.Debug ("Resetting prop '{0}'", prop
.Key
);
787 AddPropertyToDocument (prop
, new_doc
);
788 Logger
.Log
.Debug ("New prop '{0}' = '{1}'", prop
.Key
, prop
.Value
);
791 // Copy the other properties from the old document to the
792 // new one, skipping any properties that we got new values
793 // for out of the Indexable.
794 if (old_secondary_doc
!= null) {
795 foreach (Field f
in old_secondary_doc
.Fields ()) {
797 prop
= GetPropertyFromDocument (f
, old_secondary_doc
, false);
798 if (prop
!= null && ! seen_props
.Contains (prop
.Key
)) {
799 Logger
.Log
.Debug ("Old prop '{0}' = '{1}'", prop
.Key
, prop
.Value
);
800 AddPropertyToDocument (prop
, new_doc
);
808 static protected Uri
GetUriFromDocument (Document doc
)
811 uri
= doc
.Get ("Uri");
813 throw new Exception ("Got document from Lucene w/o a URI!");
814 return UriFu
.EscapedStringToUri (uri
);
817 static protected Hit
DocumentToHit (Document doc
)
822 hit
.Uri
= GetUriFromDocument (doc
);
825 str
= doc
.Get ("ParentUri");
827 hit
.ParentUri
= UriFu
.EscapedStringToUri (str
);
829 hit
.Timestamp
= StringFu
.StringToDateTime (doc
.Get ("Timestamp"));
831 AddPropertiesToHit (hit
, doc
, true);
833 // Get the Type and MimeType from the properties.
834 hit
.Type
= hit
.GetFirstProperty ("beagle:HitType");
835 hit
.MimeType
= hit
.GetFirstProperty ("beagle:MimeType");
836 hit
.Source
= hit
.GetFirstProperty ("beagle:Source");
841 static protected void AddPropertiesToHit (Hit hit
, Document doc
, bool from_primary_index
)
843 foreach (Field f
in doc
.Fields ()) {
845 prop
= GetPropertyFromDocument (f
, doc
, from_primary_index
);
847 hit
.AddProperty (prop
);
852 //////////////////////////////////////////////////////////////////////////////
855 // Handle the index's item count
// Returns the number of documents in the primary index, caching the
// result in last_item_count so subsequent calls return immediately.
858 public int GetItemCount ()
860 if (last_item_count
< 0) {
// Cache miss: count documents via a freshly acquired reader.
862 reader
= GetReader (PrimaryStore
);
863 last_item_count
= reader
.NumDocs ();
864 ReleaseReader (reader
);
866 return last_item_count
;
869 // We should set the cached count of index items when IndexReaders
870 // are open and available, so calls to GetItemCount will return immediately.
// True when the item count has already been cached.
872 protected bool HaveItemCount { get { return last_item_count >= 0; }
}
// Caches the item count from an already-open reader.
874 protected void SetItemCount (IndexReader reader
)
876 last_item_count
= reader
.NumDocs ();
// Caches an externally-computed item count.
879 public void SetItemCount (int count
)
881 last_item_count
= count
;
// Adjusts the cached count by delta, but only if a count has
// actually been cached already.
884 protected void AdjustItemCount (int delta
)
886 if (last_item_count
>= 0)
887 last_item_count
+= delta
;
890 //////////////////////////////////////////////////////////////////////////////
893 // Access to the stemmer and list of stop words
// Shared Porter stemmer instance.
// NOTE(review): used without locking from static methods below;
// confirm PorterStemmer is safe for concurrent use or that callers
// serialize access.
896 static PorterStemmer stemmer
= new PorterStemmer ();
// Stems a single word with the shared Porter stemmer.
898 static public string Stem (string str
)
900 return stemmer
.Stem (str
);
// True if the given (already-stemmed) word is one of Lucene's
// English stop words.
903 public static bool IsStopWord (string stemmed_word
)
905 return ArrayFu
.IndexOfString (StopAnalyzer
.ENGLISH_STOP_WORDS
, stemmed_word
) != -1;
908 //////////////////////////////////////////////////////////////////////////////
911 // Special Hit Filtering classes
914 static private bool TrueHitFilter (Hit hit
)
919 static private HitFilter true_hit_filter
= new HitFilter (TrueHitFilter
);
921 public class OrHitFilter
{
923 private ArrayList all
= new ArrayList ();
924 private bool contains_known_true
= false;
926 public void Add (HitFilter hit_filter
)
928 if (hit_filter
== true_hit_filter
)
929 contains_known_true
= true;
930 all
.Add (hit_filter
);
933 public bool HitFilter (Hit hit
)
935 if (contains_known_true
)
937 foreach (HitFilter hit_filter
in all
)
938 if (hit_filter (hit
))
944 public class AndHitFilter
{
946 private ArrayList all
= new ArrayList ();
948 public void Add (HitFilter hit_filter
)
950 all
.Add (hit_filter
);
953 public bool HitFilter (Hit hit
)
955 foreach (HitFilter hit_filter
in all
)
956 if (! hit_filter (hit
))
962 public class NotHitFilter
{
965 public NotHitFilter (HitFilter original
)
967 this.original
= original
;
970 public bool HitFilter (Hit hit
)
972 return ! original (hit
);
976 //////////////////////////////////////////////////////////////////////////////
982 static private LNS
.Query
StringToQuery (string field_name
,
986 ArrayList tokens
= new ArrayList ();
988 // Use the analyzer to extract the query's tokens.
989 // This code is taken from Lucene's query parser.
990 TokenStream source
= QueryAnalyzer
.TokenStream (field_name
, new StringReader (text
));
992 Lucene
.Net
.Analysis
.Token token
;
994 token
= source
.Next ();
997 } catch (IOException
) {
1001 tokens
.Add (token
.TermText ());
1005 } catch (IOException
) {
1009 if (tokens
.Count
== 0)
1012 LNS
.PhraseQuery query
= new LNS
.PhraseQuery ();
1014 foreach (string token
in tokens
) {
1016 term
= new Term (field_name
, token
);
1018 if (term_list
!= null)
1019 term_list
.Add (term
);
1026 // Date Range Handling
1029 // This function will break down dates to discrete chunks of
1030 // time to avoid expanding RangeQuerys as much as possible.
1031 // For example, searching for
1033 // YMD(5 May 2005, 16 Oct 2006)
1035 // would break down into three queries:
1037 // (YM(May 2005) AND D(5,31)) OR
1038 // YM(Jun 2005, Sep 2006) OR
1039 // (YM(Oct 2006) AND D(1,16))
// Earliest date we will ever query for: the Unix epoch.
1041 static private DateTime lower_bound
= DateTimeUtil
.UnixToDateTimeUtc (0);
1043 // FIXME: we should probably boost this sometime around 2030.
1044 // Mark your calendar.
1045 static private DateTime upper_bound
= new DateTime (2038, 12, 31);
// Term matching one exact year+month in a "YM:"-prefixed field.
// The stored value is the year followed by the zero-padded month,
// e.g. "200505" for May 2005.
1047 static private Term
NewYearMonthTerm (string field_name
, int y
, int m
)
1049 return new Term ("YM:" + field_name
, String
.Format ("{0}{1:00}", y
, m
));
// Query matching one exact year+month.
1052 static private LNS
.Query
NewYearMonthQuery (string field_name
, int y
, int m
)
1054 return new LNS
.TermQuery (NewYearMonthTerm (field_name
, y
, m
))
// Inclusive range query covering every year+month from y1/m1
// through y2/m2.
1057 static private LNS
.Query
NewYearMonthQuery (string field_name
, int y1
, int m1
, int y2
, int m2
)
1059 return new LNS
.RangeQuery (NewYearMonthTerm (field_name
, y1
, m1
),
1060 NewYearMonthTerm (field_name
, y2
, m2
),
1061 true); // query is inclusive
// Term matching one zero-padded day-of-month in a "D:"-prefixed field.
1064 static private Term
NewDayTerm (string field_name
, int d
)
1066 return new Term ("D:" + field_name
, String
.Format ("{0:00}", d
));
// Inclusive range query covering days d1 through d2 of a month.
1069 static private LNS
.Query
NewDayQuery (string field_name
, int d1
, int d2
)
1071 return new LNS
.RangeQuery (NewDayTerm (field_name
, d1
),
1072 NewDayTerm (field_name
, d2
),
1073 true); // query is inclusive
1076 private class DateRangeHitFilter
{
1078 public DateTime StartDate
;
1079 public DateTime EndDate
;
1081 public bool HitFilter (Hit hit
)
1083 // First, check the Timestamp
1084 if (Key
== QueryPart_DateRange
.AllPropertiesKey
1085 || Key
== QueryPart_DateRange
.TimestampKey
) {
1088 if (StartDate
<= dt
&& dt
<= EndDate
)
1090 if (Key
== QueryPart_DateRange
.TimestampKey
)
1094 if (Key
== QueryPart_DateRange
.AllPropertiesKey
) {
1095 // Walk through all of the properties, and see if any
1096 // date properties fall inside the range.
1097 foreach (Property prop
in hit
.Properties
) {
1098 if (prop
.Type
== PropertyType
.Date
) {
1100 dt
= StringFu
.StringToDateTime (prop
.Value
);
1101 if (StartDate
<= dt
&& dt
<= EndDate
)
1107 // Walk through all of the properties with the given key,
1108 // and see if any of them fall inside of the range.
1110 values
= hit
.GetProperties (Key
);
1111 foreach (string v
in values
) {
1113 dt
= StringFu
.StringToDateTime (v
);
1114 if (StartDate
<= dt
&& dt
<= EndDate
)
1122 static private LNS
.Query
GetDateRangeQuery (QueryPart_DateRange part
, out HitFilter hit_filter
)
1125 if (part
.Key
== QueryPart_DateRange
.AllPropertiesKey
)
1126 field_name
= TypeToWildcardField (PropertyType
.Date
);
1127 else if (part
.Key
== QueryPart_DateRange
.TimestampKey
)
1128 field_name
= "Timestamp";
1130 field_name
= PropertyToFieldName (PropertyType
.Date
, part
.Key
);
1132 // FIXME: We could optimize this and reduce the size of our range
1133 // queries if we actually knew the min and max date that appear in
1134 // any properties in the index. We would need to inspect the index to
1135 // determine that at start-up, and then track it as new documents
1136 // get added to the index.
1137 if (part
.StartDate
< lower_bound
)
1138 part
.StartDate
= lower_bound
;
1139 if (part
.EndDate
> upper_bound
|| part
.EndDate
== DateTime
.MinValue
)
1140 part
.EndDate
= upper_bound
;
1142 // Swap the start and end dates if they come in reversed.
1143 if (part
.StartDate
> part
.EndDate
) {
1145 swap
= part
.StartDate
;
1146 part
.StartDate
= part
.EndDate
;
1147 part
.EndDate
= swap
;
1150 // Set up our hit filter to cull out the bad dates.
1151 DateRangeHitFilter drhf
;
1152 drhf
= new DateRangeHitFilter ();
1153 drhf
.Key
= part
.Key
;
1154 drhf
.StartDate
= part
.StartDate
;
1155 drhf
.EndDate
= part
.EndDate
;
1156 hit_filter
= new HitFilter (drhf
.HitFilter
);
1158 Logger
.Log
.Debug ("Building new date range query");
1159 Logger
.Log
.Debug ("Start: {0}", part
.StartDate
);
1160 Logger
.Log
.Debug ("End: {0}", part
.EndDate
);
1162 int y1
, m1
, d1
, y2
, m2
, d2
;
1163 y1
= part
.StartDate
.Year
;
1164 m1
= part
.StartDate
.Month
;
1165 d1
= part
.StartDate
.Day
;
1166 y2
= part
.EndDate
.Year
;
1167 m2
= part
.EndDate
.Month
;
1168 d2
= part
.EndDate
.Day
;
1170 LNS
.BooleanQuery top_level_query
;
1171 top_level_query
= new LNS
.BooleanQuery ();
1173 // A special case: both the start and the end of our range fall
1174 // in the same month.
1175 if (y1
== y2
&& m1
== m2
) {
1177 ym_query
= NewYearMonthQuery (field_name
, y1
, m1
);
1179 // If our range only covers a part of the month, do a range query on the days.
1180 if (d1
!= 1 || d2
!= DateTime
.DaysInMonth (y2
, m2
)) {
1181 LNS
.BooleanQuery sub_query
;
1182 sub_query
= new LNS
.BooleanQuery ();
1183 sub_query
.Add (ym_query
, true, false);
1184 sub_query
.Add (NewDayQuery (field_name
, d1
, d2
), true, false);
1185 top_level_query
.Add (sub_query
, false, false);
1187 top_level_query
.Add (ym_query
, false, false);
1192 // Handle a partial month at the beginning of our range.
1194 LNS
.BooleanQuery sub_query
;
1195 sub_query
= new LNS
.BooleanQuery ();
1196 sub_query
.Add (NewYearMonthQuery (field_name
, y1
, m1
), true, false);
1197 sub_query
.Add (NewDayQuery (field_name
, d1
, DateTime
.DaysInMonth (y1
, m1
)), true, false);
1198 top_level_query
.Add (sub_query
, false, false);
1207 // And likewise, handle a partial month at the end of our range.
1208 if (d2
< DateTime
.DaysInMonth (y2
, m2
)) {
1209 LNS
.BooleanQuery sub_query
;
1210 sub_query
= new LNS
.BooleanQuery ();
1211 sub_query
.Add (NewYearMonthQuery (field_name
, y2
, m2
), true, false);
1212 sub_query
.Add (NewDayQuery (field_name
, 1, d2
), true, false);
1213 top_level_query
.Add (sub_query
, false, false);
1222 // Generate the query for the "middle" of our period, if it is non-empty
1223 if (y1
< y2
|| ((y1
== y2
) && m1
<= m2
))
1224 top_level_query
.Add (NewYearMonthQuery (field_name
, y1
, m1
, y2
, m2
),
1228 return top_level_query
;
1231 // search_subset_uris is a list of Uris that this search should be
// Translate a single QueryPart into the Lucene queries needed to
// evaluate it, plus an optional post-query HitFilter.
//
// NOTE(review): this block was reconstructed from a corrupted listing
// (dropped declarations, braces and returns); the visible logic is
// preserved and only the minimal inferred lines were restored.
//
// abstract_part: the part to translate; one of the QueryPart_*
//     subclasses handled below.  An unhandled subclass throws.
// only_build_primary_query: when true, secondary_query is left null.
// term_list: if non-null, collects the Terms used (callers use this
//     to calculate scores).
// primary_query: query to run against the primary index (null if the
//     part generates nothing).
// secondary_query: query to run against the secondary index, or null.
// hit_filter: delegate used to refine the raw query results.
static protected void QueryPartToQuery (QueryPart abstract_part,
					bool only_build_primary_query,
					ArrayList term_list,
					out LNS.Query primary_query,
					out LNS.Query secondary_query,
					out HitFilter hit_filter)
{
	primary_query = null;
	secondary_query = null;

	// By default, we assume that our lucene queries will return exactly the
	// matching set of objects.  We need to set the hit filter if further
	// refinement of the search results is required.  (As in the case of
	// date range queries, for example.)  We essentially have to do this
	// to make OR queries work correctly.
	hit_filter = true_hit_filter;

	// Bail out early on a null part.  (Fix: the original dereferenced
	// abstract_part.Logic before performing this null check, which
	// would have thrown a NullReferenceException first.)
	if (abstract_part == null)
		return;

	// The exception is when dealing with a prohibited part.  Just return
	// null for the hit filter in that case.  This works since
	// prohibited parts are not allowed inside of OR queries.
	if (abstract_part.Logic == QueryPartLogic.Prohibited)
		hit_filter = null;

	if (abstract_part is QueryPart_Text) {
		QueryPart_Text part = (QueryPart_Text) abstract_part;

		// Nothing to search: no query to build.
		if (! (part.SearchFullText || part.SearchTextProperties))
			return;

		LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
		LNS.BooleanQuery s_query = new LNS.BooleanQuery ();

		if (part.SearchFullText) {
			LNS.Query subquery;
			subquery = StringToQuery ("Text", part.Text, term_list);
			if (subquery != null)
				p_query.Add (subquery, false, false);

			// FIXME: HotText is ignored for now!
			// subquery = StringToQuery ("HotText", part.Text);
			// if (subquery != null)
			//         p_query.Add (subquery, false, false);
		}

		if (part.SearchTextProperties) {
			LNS.Query subquery;
			subquery = StringToQuery ("PropertyText", part.Text, term_list);
			if (subquery != null) {
				p_query.Add (subquery, false, false);
				// Properties can live in either index
				if (! only_build_primary_query)
					s_query.Add (subquery.Clone () as LNS.Query, false, false);
			}

			Term term;
			term = new Term ("PropertyKeyword", part.Text.ToLower ()); // make sure text is lowercased
			// FIXME: terms are already added in term_list.  But they may have been tokenized
			// The term here is non-tokenized version.  Should this be added to term_list ?
			// term_list is used to calculate scores
			if (term_list != null)
				term_list.Add (term);
			subquery = new LNS.TermQuery (term);
			p_query.Add (subquery, false, false);
			// Properties can live in either index
			if (! only_build_primary_query)
				s_query.Add (subquery.Clone () as LNS.Query, false, false);
		}

		primary_query = p_query;
		if (! only_build_primary_query)
			secondary_query = s_query;

		return;
	}

	if (abstract_part is QueryPart_Wildcard) {
		QueryPart_Wildcard part = (QueryPart_Wildcard) abstract_part;

		LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
		LNS.BooleanQuery s_query = new LNS.BooleanQuery ();

		Term term;
		LNS.Query subquery;

		// Lower case the terms for searching
		string query_string_lower = part.QueryString.ToLower ();

		// Search text content
		term = new Term ("Text", query_string_lower);
		subquery = new LNS.WildcardQuery (term);
		p_query.Add (subquery, false, false);
		term_list.Add (term);

		// Search text properties
		term = new Term ("PropertyText", query_string_lower);
		subquery = new LNS.WildcardQuery (term);
		p_query.Add (subquery, false, false);
		// Properties can live in either index
		if (! only_build_primary_query)
			s_query.Add (subquery.Clone () as LNS.Query, false, false);
		term_list.Add (term);

		// Search property keywords
		term = new Term ("PropertyKeyword", query_string_lower);
		term_list.Add (term);
		subquery = new LNS.WildcardQuery (term);
		p_query.Add (subquery, false, false);
		// Properties can live in either index
		if (! only_build_primary_query)
			s_query.Add (subquery.Clone () as LNS.Query, false, false);

		primary_query = p_query;
		if (! only_build_primary_query)
			secondary_query = s_query;

		return;
	}

	if (abstract_part is QueryPart_Property) {
		QueryPart_Property part = (QueryPart_Property) abstract_part;

		string field_name;
		if (part.Key == QueryPart_Property.AllProperties)
			field_name = TypeToWildcardField (part.Type);
		else
			field_name = PropertyToFieldName (part.Type, part.Key);

		if (part.Type == PropertyType.Text) {
			primary_query = StringToQuery (field_name, part.Value, term_list);
		} else {
			// Non-text properties are matched as a single keyword term.
			Term term;
			term = new Term (field_name, part.Value.ToLower ());
			if (term_list != null)
				term_list.Add (term);
			primary_query = new LNS.TermQuery (term);
		}

		// Properties can live in either index
		if (! only_build_primary_query && primary_query != null)
			secondary_query = primary_query.Clone () as LNS.Query;

		return;
	}

	if (abstract_part is QueryPart_DateRange) {
		QueryPart_DateRange part = (QueryPart_DateRange) abstract_part;

		primary_query = GetDateRangeQuery (part, out hit_filter);
		// Date properties can live in either index
		if (! only_build_primary_query && primary_query != null)
			secondary_query = primary_query.Clone () as LNS.Query;

		// If this is a prohibited part, invert our hit filter.
		if (part.Logic == QueryPartLogic.Prohibited) {
			NotHitFilter nhf;
			nhf = new NotHitFilter (hit_filter);
			hit_filter = new HitFilter (nhf.HitFilter);
		}

		return;
	}

	if (abstract_part is QueryPart_Or) {
		QueryPart_Or part = (QueryPart_Or) abstract_part;

		// Assemble a new BooleanQuery combining all of the sub-parts.
		LNS.BooleanQuery p_query;
		p_query = new LNS.BooleanQuery ();

		LNS.BooleanQuery s_query = null;
		if (! only_build_primary_query)
			s_query = new LNS.BooleanQuery ();

		primary_query = p_query;
		secondary_query = s_query;

		OrHitFilter or_hit_filter = null;

		foreach (QueryPart sub_part in part.SubParts) {
			LNS.Query p_subq, s_subq;
			HitFilter sub_hit_filter; // FIXME: This is (and must be) ignored
			// FIXME: Any subpart in an OR which has a hit filter won't work
			// correctly, because we can't tell which part of an OR we matched
			// against to filter correctly.  This affects date range queries.
			QueryPartToQuery (sub_part, only_build_primary_query,
					  term_list,
					  out p_subq, out s_subq, out sub_hit_filter);
			// NOTE(review): the null guards below were reconstructed;
			// a sub-part can legitimately produce a null query.
			if (p_subq != null)
				p_query.Add (p_subq, false, false);
			if (s_subq != null)
				s_query.Add (s_subq, false, false);
			if (sub_hit_filter != null) {
				if (or_hit_filter == null)
					or_hit_filter = new OrHitFilter ();
				or_hit_filter.Add (sub_hit_filter);
			}
		}

		if (or_hit_filter != null)
			hit_filter = new HitFilter (or_hit_filter.HitFilter);

		return;
	}

	throw new Exception ("Unhandled QueryPart type! " + abstract_part.ToString ());
}
// Build a query matching the single document whose field_name field
// holds the escaped form of the given Uri.
static protected LNS.Query UriQuery (string field_name, Uri uri)
{
	return new LNS.TermQuery (new Term (field_name, UriFu.UriToEscapedString (uri)));
}
// Build a query matching any of the Uris in uri_list, with no extra
// requirement.  Convenience overload of the three-argument form.
static protected LNS.Query UriQuery (string field_name, ICollection uri_list)
{
	return UriQuery (field_name, uri_list, null);
}
// Build a query matching any of the Uris in uri_list against the
// field_name field, optionally ANDed with extra_requirement.
// Because Lucene caps the number of clauses in a BooleanQuery, the
// Uri terms are spread across N sub-queries when uri_list is larger
// than the clause limit.  Returns null for an empty uri_list.
//
// NOTE(review): reconstructed from a corrupted listing — the cursor
// advance over the bottom queries and the final return were inferred.
static protected LNS.Query UriQuery (string field_name, ICollection uri_list, LNS.Query extra_requirement)
{
	if (uri_list.Count == 0)
		return null;

	int max_clauses;
	max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();

	// Number of bottom-level queries needed to hold all the clauses.
	int N;
	N = 1 + (uri_list.Count - 1) / max_clauses;

	LNS.BooleanQuery top_query;
	top_query = new LNS.BooleanQuery ();

	// The extra requirement, if any, must be satisfied (required=true).
	if (extra_requirement != null) {
		top_query.Add (extra_requirement, true, false);
	}

	ArrayList bottom_queries = null;

	// If we need more than one bottom query, hang them off the top query.
	if (N > 1) {
		bottom_queries = new ArrayList ();

		for (int i = 0; i < N; ++i) {
			LNS.BooleanQuery bq;
			bq = new LNS.BooleanQuery ();
			bottom_queries.Add (bq);
			top_query.Add (bq, false, false);
		}
	}

	// Spread the Uri clauses round-robin across the bottom queries.
	int cursor = 0;
	foreach (Uri uri in uri_list) {
		LNS.Query subquery;
		subquery = UriQuery (field_name, uri);

		LNS.BooleanQuery target;
		if (N == 1) {
			target = top_query;
		} else {
			target = (LNS.BooleanQuery) bottom_queries [cursor];
			++cursor;
			if (cursor >= N)
				cursor = 0;
		}

		target.Add (subquery, false, false);
	}

	return top_query;
}
1506 ///////////////////////////////////////////////////////////////////////////////////
// The number of segments in the index: the larger of the ".cfs"
// (compound file segment) counts of the primary and secondary index
// directories.  Used to decide when the index needs optimizing.
public int SegmentCount {
	get {
		DirectoryInfo dir_info;
		int p_count = 0, s_count = 0;

		// Count compound-file segments in the primary index.
		dir_info = new DirectoryInfo (PrimaryIndexDirectory);
		foreach (FileInfo file_info in dir_info.GetFiles ())
			if (file_info.Extension == ".cfs")
				++p_count;

		// ... and in the secondary index.
		dir_info = new DirectoryInfo (SecondaryIndexDirectory);
		foreach (FileInfo file_info in dir_info.GetFiles ())
			if (file_info.Extension == ".cfs")
				++s_count;

		return p_count > s_count ? p_count : s_count;
	}
}
1527 ///////////////////////////////////////////////////////////////////////////////////
1529 // Cache IndexReaders on a per-Lucene index basis, since they
1530 // are extremely expensive to create. Note that using this
1531 // only makes sense in situations where the index only
1532 // possibly might change from underneath us, but most of the
1533 // time probably won't. This means it makes sense to do
1534 // this in LuceneQueryingDriver.cs, but it doesn't in
1535 // LuceneIndexingDriver.cs.
// A cached IndexReader together with the index version it was opened
// against and a reference count, so readers can be shared and closed
// only when the last user releases them.
private class ReaderAndVersion {

	public IndexReader Reader;
	public long Version;
	public int Refcount;

	public ReaderAndVersion (IndexReader reader, long version)
	{
		this.Reader = reader;
		this.Version = version;
		// NOTE(review): initial refcount reconstructed from a
		// corrupted listing — the creator holds the first reference.
		this.Refcount = 1;
	}
}
// Cache maps guarded by locking reader_rav_map:
// Lucene Directory -> most recent ReaderAndVersion for that index,
// and IndexReader -> its ReaderAndVersion (for refcounted release).
static private Hashtable directory_rav_map = new Hashtable ();
static private Hashtable reader_rav_map = new Hashtable ();
// Get an IndexSearcher over the given directory, backed by a cached
// (refcounted) IndexReader.  Pair with ReleaseSearcher.
static public LNS.IndexSearcher GetSearcher (Lucene.Net.Store.Directory directory)
{
	IndexReader reader = GetReader (directory);

	return new LNS.IndexSearcher (reader);
}
// Get a (possibly cached) IndexReader for the given directory.  A
// cached reader is reused while the index version is unchanged; when
// the version has moved, a fresh reader is opened and cached.  Pair
// every call with ReleaseReader.
//
// NOTE(review): reconstructed from a corrupted listing — the
// refcount handling and returns were inferred from the visible
// cache-miss / stale-version structure; confirm against the
// original before relying on the exact refcount semantics.
static public IndexReader GetReader (Lucene.Net.Store.Directory directory)
{
	IndexReader reader;
	long version;

	lock (reader_rav_map) {
		ReaderAndVersion rav = (ReaderAndVersion) directory_rav_map [directory];

		if (rav == null) {
			// Cache miss: open a new reader and cache it.
			version = IndexReader.GetCurrentVersion (directory);
			reader = IndexReader.Open (directory);

			rav = new ReaderAndVersion (reader, version);

			directory_rav_map [directory] = rav;
			reader_rav_map [reader] = rav;

			return reader;
		}

		version = IndexReader.GetCurrentVersion (directory);

		if (version != rav.Version) {
			// The index has changed underneath us: drop our
			// cached reference and open a fresh reader.
			UnrefReaderAndVersion_Unlocked (rav);

			reader = IndexReader.Open (directory);

			rav = new ReaderAndVersion (reader, version);

			directory_rav_map [directory] = rav;
			reader_rav_map [reader] = rav;
		} else {
			// Cache hit: hand out another reference.
			++rav.Refcount;
			reader = rav.Reader;
		}

		return reader;
	}
}
// Drop one reference to a cached reader; when the last reference is
// gone, remove it from both cache maps and close the reader.
// Caller must hold the lock on reader_rav_map.
static private void UnrefReaderAndVersion_Unlocked (ReaderAndVersion rav)
{
	--rav.Refcount;

	if (rav.Refcount == 0) {
		reader_rav_map.Remove (rav.Reader);
		directory_rav_map.Remove (rav.Reader.Directory ());
		rav.Reader.Close ();
	}
}
// Release a reader obtained from GetReader, decrementing its
// refcount (and closing it if this was the last reference).
static public void ReleaseReader (IndexReader reader)
{
	lock (reader_rav_map) {
		ReaderAndVersion rav = (ReaderAndVersion) reader_rav_map [reader];

		UnrefReaderAndVersion_Unlocked (rav);
	}
}
// Release a searcher obtained from GetSearcher: close the searcher
// and release its underlying cached reader.
static public void ReleaseSearcher (LNS.IndexSearcher searcher)
{
	IndexReader reader = searcher.GetIndexReader ();

	// NOTE(review): the Close call was reconstructed from a
	// corrupted listing — confirm the original closed the searcher
	// before releasing its reader.
	searcher.Close ();
	ReleaseReader (reader);
}
1629 ///////////////////////////////////////////////////////////////////////////////////
1632 // Various ways to grab lots of hits at once.
1633 // These should never be used for querying, only for utility
// Fill block_of_hits with hits read sequentially from the primary
// index starting at document number 'cookie' (wrapping around at the
// end of the index), merging in any matching properties stored in
// the secondary index.  Unused slots are null-padded.  Returns the
// cookie to pass in to fetch the next block.
//
// NOTE(review): reconstructed from a corrupted listing — the
// negative-cookie guard, loop-counter maintenance and final return
// were inferred from the visible fragments.
public int GetBlockOfHits (int cookie,
			   Hit [] block_of_hits)
{
	IndexReader primary_reader;
	IndexReader secondary_reader;
	primary_reader = GetReader (PrimaryStore);
	secondary_reader = GetReader (SecondaryStore);

	// Never ask for more documents than the index holds.
	int request_size;
	request_size = block_of_hits.Length;
	if (request_size > primary_reader.NumDocs ())
		request_size = primary_reader.NumDocs ();

	int max_doc;
	max_doc = primary_reader.MaxDoc ();

	// A negative cookie means "start at a random document".
	// NOTE(review): reconstructed guard — original condition not visible.
	if (cookie < 0) {
		Random random;
		random = new Random ();
		cookie = random.Next (max_doc);
	}

	int original_cookie;
	original_cookie = cookie;

	Hashtable primary_docs, secondary_docs;
	primary_docs = UriFu.NewHashtable ();
	secondary_docs = UriFu.NewHashtable ();

	// Load the primary documents
	for (int i = 0; i < request_size; ++i) {

		if (! primary_reader.IsDeleted (cookie)) {
			Document doc;
			doc = primary_reader.Document (cookie);
			primary_docs [GetUriFromDocument (doc)] = doc;
		}

		++cookie;
		if (cookie >= max_doc) // wrap around
			cookie = 0;

		// If we somehow end up back where we started,
		// give up.
		if (cookie == original_cookie)
			break;
	}

	// If necessary, load the secondary documents
	if (secondary_reader != null) {
		LNS.IndexSearcher searcher;
		searcher = new LNS.IndexSearcher (secondary_reader);

		LNS.Query uri_query;
		uri_query = UriQuery ("Uri", primary_docs.Keys);

		LNS.Hits hits;
		hits = searcher.Search (uri_query);
		for (int i = 0; i < hits.Length (); ++i) {
			Document doc;
			doc = hits.Doc (i);
			secondary_docs [GetUriFromDocument (doc)] = doc;
		}
	}

	ReleaseReader (primary_reader);
	ReleaseReader (secondary_reader);

	// Now assemble the hits
	int j = 0;
	foreach (Uri uri in primary_docs.Keys) {
		Document primary_doc, secondary_doc;
		primary_doc = primary_docs [uri] as Document;
		secondary_doc = secondary_docs [uri] as Document;

		Hit hit;
		hit = DocumentToHit (primary_doc);
		if (secondary_doc != null)
			AddPropertiesToHit (hit, secondary_doc, false);

		block_of_hits [j] = hit;
		++j;
	}

	// null-pad the array, if necessary
	for (; j < block_of_hits.Length; ++j)
		block_of_hits [j] = null;

	// Return the new cookie
	return cookie;
}
1732 // For a large index, this will be very slow and will consume
1733 // a lot of memory. Don't call it without a good reason!
1734 // We return a hashtable indexed by Uri.
1735 public Hashtable
GetAllHitsByUri ()
1738 all_hits
= UriFu
.NewHashtable ();
1740 IndexReader primary_reader
;
1741 IndexReader secondary_reader
;
1742 primary_reader
= GetReader (PrimaryStore
);
1743 secondary_reader
= GetReader (SecondaryStore
);
1745 // Load everything from the primary index
1747 max_doc
= primary_reader
.MaxDoc ();
1748 for (int i
= 0; i
< max_doc
; ++i
) {
1750 if (primary_reader
.IsDeleted (i
))
1754 doc
= primary_reader
.Document (i
);
1757 hit
= DocumentToHit (doc
);
1758 all_hits
[hit
.Uri
] = hit
;
1761 // Now add in everything from the secondary index, if it exists
1762 if (secondary_reader
!= null) {
1763 max_doc
= secondary_reader
.MaxDoc ();
1764 for (int i
= 0; i
< max_doc
; ++i
) {
1766 if (secondary_reader
.IsDeleted (i
))
1770 doc
= secondary_reader
.Document (i
);
1773 uri
= GetUriFromDocument (doc
);
1776 hit
= (Hit
) all_hits
[uri
];
1778 AddPropertiesToHit (hit
, doc
, false);
1782 ReleaseReader (primary_reader
);
1783 ReleaseReader (secondary_reader
);