2 // LuceneQueryingDriver.cs
4 // Copyright (C) 2004-2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System
.Collections
;
29 using System
.Diagnostics
;
30 using System
.Globalization
;
33 using System
.Threading
;
35 using System
.Xml
.Serialization
;
37 using Lucene
.Net
.Analysis
;
38 using Lucene
.Net
.Analysis
.Standard
;
39 using Lucene
.Net
.Documents
;
40 using Lucene
.Net
.Index
;
41 using Lucene
.Net
.QueryParsers
;
42 using LNS
= Lucene
.Net
.Search
;
46 namespace Beagle
.Daemon
{
48 public class LuceneQueryingDriver
: LuceneCommon
{
// When true, emits verbose query-debugging output via Logger below.
50 static public bool Debug
= false;
// Prefix marking properties that are internal to the daemon; hits have
// these properties stripped before being broadcast (see GenerateQueryResults).
52 public const string PrivateNamespace
= "_private:";
// Predicate used to accept/reject a match by Uri before the Hit is built.
54 public delegate bool UriFilter (Uri uri
);
// Maps a Hit to a relevancy multiplier.
// NOTE(review): no caller is visible in this chunk — confirm it is still used.
55 public delegate double RelevancyMultiplier (Hit hit
);
// Constructs a querying driver over the named Lucene index.
// NOTE(review): this listing is garbled — several original lines (braces and
// the index-exists/create branching) are missing, so the comments below
// describe only what is visible.
57 public LuceneQueryingDriver (string index_name
, int minor_version
, bool read_only
)
58 : base (index_name
, minor_version
)
60 // FIXME: Maybe the LuceneQueryingDriver should never try to create the index?
66 // We're in read-only mode, but we can't create an index.
67 // Maybe a different exception would be better? This one is caught
68 // in QueryDriver.LoadStaticQueryable ()
69 throw new InvalidOperationException ();
72 // Initialize the user text cache only if we're not in
73 // read-only mode. StaticQueryables instantiate their
74 // own text caches that are stored in a separate
77 text_cache
= TextCache
.UserCache
;
80 ////////////////////////////////////////////////////////////////
83 ////////////////////////////////////////////////////////////////
// Returns the Uris of all documents in the primary index whose property
// (identified by type+key) exactly matches prop.Value, via a single
// Lucene TermQuery.
// NOTE(review): the listing is truncated — the loop body's Document
// retrieval, the return statement, and closing braces are missing.
85 public Uri
[] PropertyQuery (Property prop
)
87 // FIXME: Should we support scanning the secondary
90 IndexReader primary_reader
;
91 LNS
.IndexSearcher primary_searcher
;
93 primary_reader
= LuceneCommon
.GetReader (PrimaryStore
);
94 primary_searcher
= new LNS
.IndexSearcher (primary_reader
);
// Build an exact-match term query against the field derived from the
// property's type and key.
96 Term term
= new Term (PropertyToFieldName (prop
.Type
, prop
.Key
), prop
.Value
);
97 LNS
.TermQuery query
= new LNS
.TermQuery (term
);
98 LNS
.Hits hits
= primary_searcher
.Search (query
);
// Convert each matching document back to its Uri.
100 Uri
[] uri_list
= new Uri
[hits
.Length ()];
101 for (int i
= 0; i
< hits
.Length (); i
++) {
104 uri_list
[i
] = GetUriFromDocument (doc
);
// Release searcher and reader before returning.
107 primary_searcher
.Close ();
108 ReleaseReader (primary_reader
);
113 ////////////////////////////////////////////////////////////////
115 // Returns the lowest matching score before the results are
// Executes a full Beagle Query: translates its parts into Lucene queries,
// applies subset whitelists and prohibited-part blacklists, runs the
// required queries against the primary (and optionally secondary) index,
// and hands matches off to GenerateQueryResults.
// NOTE(review): listing is truncated — some parameters (e.g. the result
// sink), braces, and several statements are missing; comments describe
// only the visible code.
117 public void DoQuery (Query query
,
119 ICollection search_subset_uris
, // should be internal uris
120 UriFilter uri_filter
,
121 HitFilter hit_filter
)
124 sw
= new Stopwatch ();
127 // Assemble all of the parts into a bunch of Lucene queries
129 ArrayList primary_required_part_queries
= null;
130 ArrayList secondary_required_part_queries
= null;
132 LNS
.BooleanQuery primary_prohibited_part_query
= null;
133 LNS
.BooleanQuery secondary_prohibited_part_query
= null;
// All per-part hit filters (plus the caller's) are ANDed together.
135 AndHitFilter all_hit_filters
;
136 all_hit_filters
= new AndHitFilter ();
137 if (hit_filter
!= null)
138 all_hit_filters
.Add (hit_filter
);
140 ArrayList term_list
= new ArrayList ();
// Translate each query part into primary/secondary Lucene queries and
// an optional hit filter, collecting terms from required parts for
// later scoring.
142 foreach (QueryPart part
in query
.Parts
) {
143 LNS
.Query primary_part_query
;
144 LNS
.Query secondary_part_query
;
145 HitFilter part_hit_filter
;
146 QueryPartToQuery (part
,
147 false, // we want both primary and secondary queries
148 part
.Logic
== QueryPartLogic
.Required
? term_list
: null,
149 out primary_part_query
,
150 out secondary_part_query
,
151 out part_hit_filter
);
153 if (primary_part_query
== null)
156 switch (part
.Logic
) {
158 case QueryPartLogic
.Required
:
159 if (primary_required_part_queries
== null) {
160 primary_required_part_queries
= new ArrayList ();
161 secondary_required_part_queries
= new ArrayList ();
163 primary_required_part_queries
.Add (primary_part_query
);
164 secondary_required_part_queries
.Add (secondary_part_query
);
166 if (part_hit_filter
!= null)
167 all_hit_filters
.Add (part_hit_filter
);
// Prohibited parts are ORed into a single BooleanQuery per index
// (Add with required=false, prohibited=false => optional clause).
171 case QueryPartLogic
.Prohibited
:
172 if (primary_prohibited_part_query
== null)
173 primary_prohibited_part_query
= new LNS
.BooleanQuery ();
174 primary_prohibited_part_query
.Add (primary_part_query
, false, false);
176 if (secondary_part_query
!= null) {
177 if (secondary_prohibited_part_query
== null)
178 secondary_prohibited_part_query
= new LNS
.BooleanQuery ();
179 secondary_prohibited_part_query
.Add (secondary_part_query
, false, false);
// A prohibited part's hit filter is negated before being ANDed in.
182 if (part_hit_filter
!= null) {
184 nhf
= new NotHitFilter (part_hit_filter
);
185 all_hit_filters
.Add (new HitFilter (nhf
.HitFilter
));
192 // If we have no required parts, give up.
193 if (primary_required_part_queries
== null)
197 // Now that we have all of these nice queries, let's execute them!
200 // Create the searchers that we will need.
202 IndexReader primary_reader
;
203 LNS
.IndexSearcher primary_searcher
;
204 IndexReader secondary_reader
= null;
205 LNS
.IndexSearcher secondary_searcher
= null;
207 primary_reader
= LuceneCommon
.GetReader (PrimaryStore
);
208 primary_searcher
= new LNS
.IndexSearcher (primary_reader
);
// An empty secondary index is treated the same as having none at all.
210 if (SecondaryStore
!= null) {
211 secondary_reader
= LuceneCommon
.GetReader (SecondaryStore
);
212 if (secondary_reader
.NumDocs () == 0) {
213 ReleaseReader (secondary_reader
);
214 secondary_reader
= null;
218 if (secondary_reader
!= null)
219 secondary_searcher
= new LNS
.IndexSearcher (secondary_reader
);
222 // Possibly create our whitelists from the search subset.
224 LuceneBitArray primary_whitelist
= null;
225 LuceneBitArray secondary_whitelist
= null;
227 if (search_subset_uris
!= null && search_subset_uris
.Count
> 0) {
228 primary_whitelist
= new LuceneBitArray (primary_searcher
);
229 if (secondary_searcher
!= null)
230 secondary_whitelist
= new LuceneBitArray (secondary_searcher
);
232 foreach (Uri uri
in search_subset_uris
) {
233 primary_whitelist
.AddUri (uri
);
234 if (secondary_whitelist
!= null)
235 secondary_whitelist
.AddUri (uri
);
237 primary_whitelist
.FlushUris ();
238 if (secondary_whitelist
!= null)
239 secondary_whitelist
.FlushUris ();
243 // Build blacklists from our prohibited parts.
245 LuceneBitArray primary_blacklist
= null;
246 LuceneBitArray secondary_blacklist
= null;
248 if (primary_prohibited_part_query
!= null) {
249 primary_blacklist
= new LuceneBitArray (primary_searcher
,
250 primary_prohibited_part_query
);
// Join the secondary blacklist into the primary one so a document
// prohibited in either index is excluded from both.
252 if (secondary_searcher
!= null) {
253 secondary_blacklist
= new LuceneBitArray (secondary_searcher
);
254 if (secondary_prohibited_part_query
!= null)
255 secondary_blacklist
.Or (secondary_prohibited_part_query
);
256 primary_blacklist
.Join (secondary_blacklist
);
261 // Combine our whitelist and blacklist into just a whitelist.
263 if (primary_blacklist
!= null) {
264 if (primary_whitelist
== null) {
265 primary_blacklist
.Not ();
266 primary_whitelist
= primary_blacklist
;
268 primary_whitelist
.AndNot (primary_blacklist
);
272 if (secondary_blacklist
!= null) {
273 if (secondary_whitelist
== null) {
274 secondary_blacklist
.Not ();
275 secondary_whitelist
= secondary_blacklist
;
277 secondary_whitelist
.AndNot (secondary_blacklist
);
// Run the required queries; the two-index path joins matches across
// both indexes, the single-index path is a straight BooleanQuery.
281 BetterBitArray primary_matches
= null;
283 if (primary_required_part_queries
!= null) {
285 if (secondary_searcher
!= null)
286 primary_matches
= DoRequiredQueries_TwoIndex (primary_searcher
,
288 primary_required_part_queries
,
289 secondary_required_part_queries
,
291 secondary_whitelist
);
293 primary_matches
= DoRequiredQueries (primary_searcher
,
294 primary_required_part_queries
,
301 Logger
.Log
.Debug ("###### Finished low-level queries in {0}", sw
);
305 // Only generate results if we got some matches
306 if (primary_matches
!= null && primary_matches
.ContainsTrue ()) {
307 GenerateQueryResults (primary_reader
,
315 new HitFilter (all_hit_filters
.HitFilter
));
319 // Finally, we clean up after ourselves.
322 primary_searcher
.Close ();
323 if (secondary_searcher
!= null)
324 secondary_searcher
.Close ();
325 ReleaseReader (primary_reader
);
326 if (secondary_reader
!= null)
327 ReleaseReader (secondary_reader
);
332 Logger
.Log
.Debug ("###### Processed query in {0}", sw
);
336 ////////////////////////////////////////////////////////////////
339 // Special logic for handling our set of required queries
342 // This is the easy case: we just combine all of the queries
343 // into one big BooleanQuery.
// Single-index case: AND all required queries into one BooleanQuery
// (Add with required=true, prohibited=false), run it, and intersect the
// result with the whitelist if one was supplied.
// NOTE(review): the return of `matches` is missing from this truncated
// listing but is implied by the declared BetterBitArray return type.
344 private static BetterBitArray
DoRequiredQueries (LNS
.IndexSearcher primary_searcher
,
345 ArrayList primary_queries
,
346 BetterBitArray primary_whitelist
)
348 LNS
.BooleanQuery combined_query
;
349 combined_query
= new LNS
.BooleanQuery ();
350 foreach (LNS
.Query query
in primary_queries
)
351 combined_query
.Add (query
, true, false);
353 LuceneBitArray matches
;
354 matches
= new LuceneBitArray (primary_searcher
, combined_query
);
355 if (primary_whitelist
!= null)
356 matches
.And (primary_whitelist
);
361 // This code attempts to execute N required queries in the
362 // most efficient order to minimize the amount of time spent
363 // joining between the two indexes. It returns a joined bit
364 // array of matches against the primary index.
// Helper record pairing a query's primary- and secondary-index match
// bitmaps. Sorts by UpperBound (sum of true counts in both bitmaps) so
// the cheapest candidate can be processed first.
// NOTE(review): listing is truncated — the method header around the
// Join call and several braces are missing.
366 private class MatchInfo
: IComparable
{
368 public LuceneBitArray PrimaryMatches
= null;
369 public LuceneBitArray SecondaryMatches
= null;
// Upper bound on the number of joined matches; recomputed in RestrictBy.
370 public int UpperBound
= 0;
374 PrimaryMatches
.Join (SecondaryMatches
);
// Intersects this pair of bitmaps with another (already-joined) pair,
// then recomputes UpperBound. Called with null just to initialize
// UpperBound (see DoRequiredQueries_TwoIndex).
377 public void RestrictBy (MatchInfo joined
)
379 if (joined
!= null) {
380 this.PrimaryMatches
.And (joined
.PrimaryMatches
);
381 this.SecondaryMatches
.And (joined
.SecondaryMatches
);
385 UpperBound
+= PrimaryMatches
.TrueCount
;
386 UpperBound
+= SecondaryMatches
.TrueCount
;
// Ascending order by UpperBound: smallest candidate set sorts first.
389 public int CompareTo (object obj
)
391 MatchInfo other
= (MatchInfo
) obj
;
392 return this.UpperBound
- other
.UpperBound
;
396 // Any whitelists that are passed in must be fully joined, or
397 // query results will be incorrect.
// Two-index case: runs each required query against both indexes,
// intersects with the whitelists, then repeatedly restricts all
// remaining candidate sets by the smallest one (cheapest join first),
// short-circuiting as soon as any candidate set is provably empty.
// Returns the primary-index match bitmap.
// NOTE(review): listing is truncated — the statements that actually
// execute pq/sq into p_matches/s_matches and some braces are missing.
398 private static BetterBitArray
DoRequiredQueries_TwoIndex (LNS
.IndexSearcher primary_searcher
,
399 LNS
.IndexSearcher secondary_searcher
,
400 ArrayList primary_queries
,
401 ArrayList secondary_queries
,
402 BetterBitArray primary_whitelist
,
403 BetterBitArray secondary_whitelist
)
405 ArrayList match_info_list
;
406 match_info_list
= new ArrayList ();
408 // First, do all of the low-level queries
409 // and store them in our MatchInfo
410 for (int i
= 0; i
< primary_queries
.Count
; ++i
) {
412 pq
= primary_queries
[i
] as LNS
.Query
;
413 sq
= secondary_queries
[i
] as LNS
.Query
;
415 LuceneBitArray p_matches
= null, s_matches
= null;
416 p_matches
= new LuceneBitArray (primary_searcher
);
419 if (primary_whitelist
!= null)
420 p_matches
.And (primary_whitelist
);
423 s_matches
= new LuceneBitArray (secondary_searcher
);
426 if (secondary_whitelist
!= null)
427 s_matches
.And (secondary_whitelist
);
431 info
= new MatchInfo ();
432 info
.PrimaryMatches
= p_matches
;
433 info
.SecondaryMatches
= s_matches
;
434 info
.RestrictBy (null); // a hack to initialize the UpperBound
435 match_info_list
.Add (info
);
438 // We want to be smart about the order we do this in,
439 // to minimize the expense of the Join.
440 while (match_info_list
.Count
> 1) {
442 // FIXME: We don't really need to sort here, it would
443 // be sufficient to just find the minimal element.
444 match_info_list
.Sort ();
446 smallest
= match_info_list
[0] as MatchInfo
;
447 match_info_list
.RemoveAt (0);
449 // We can short-circuit if our smallest set of
451 if (smallest
.UpperBound
== 0)
452 return smallest
.PrimaryMatches
; // this must be an empty array.
// Everything remaining must agree with the smallest set, so AND it in.
456 foreach (MatchInfo info
in match_info_list
)
457 info
.RestrictBy (smallest
);
460 // For the final pair, we don't need to do a full join:
461 // mapping the secondary onto the primary is sufficient
463 last
= match_info_list
[0] as MatchInfo
;
464 last
.SecondaryMatches
.ProjectOnto (last
.PrimaryMatches
);
466 return last
.PrimaryMatches
;
469 ////////////////////////////////////////////////////////////////
// Assigns tf*idf scores to the hits in hits_by_id (keyed by Lucene doc
// id): for each query term, walks the term's postings in the reader and
// accumulates tf(freq) * idf(docFreq, maxDoc) into each matching Hit.
// NOTE(review): listing is truncated — the reader parameter declaration
// and several statements/braces are missing; comments reflect only the
// visible code.
471 static private void ScoreHits (Hashtable hits_by_id
,
473 ICollection term_list
)
476 sw
= new Stopwatch ();
479 LNS
.Similarity similarity
;
480 similarity
= LNS
.Similarity
.GetDefault ();
482 foreach (Term term
in term_list
) {
// Inverse document frequency of this term across the whole index.
485 idf
= similarity
.Idf (reader
.DocFreq (term
), reader
.MaxDoc ());
// Countdown so we can stop walking postings once every hit is scored.
488 hit_count
= hits_by_id
.Count
;
491 term_docs
= reader
.TermDocs (term
);
492 while (term_docs
.Next () && hit_count
> 0) {
495 id
= term_docs
.Doc ();
498 hit
= hits_by_id
[id
] as Hit
;
501 tf
= similarity
.Tf (term_docs
.Freq ());
502 hit
.Score
+= tf
* idf
;
511 ////////////////////////////////////////////////////////////////
// Pairs a Lucene Document with its doc id while results are assembled
// (used below in GenerateQueryResults, which sets .Doc and .Id).
// NOTE(review): the field declarations are missing from this truncated
// listing.
513 private class DocAndId
{
519 // Given a set of hits, broadcast some set out as our query
// Turns the primary-index match bitmap into Hit objects: selects up to
// max_results newest documents by timestamp, optionally reunifies each
// hit with its extra properties from the secondary index, scores hits
// via ScoreHits, applies the uri/hit filters, strips PrivateNamespace
// properties, and adds the surviving hits to the result.
// NOTE(review): listing is truncated — several parameters (max_results,
// the result sink), loop headers, and braces are missing; comments
// describe only the visible code.
523 private static void GenerateQueryResults (IndexReader primary_reader
,
524 LNS
.IndexSearcher primary_searcher
,
525 LNS
.IndexSearcher secondary_searcher
,
526 BetterBitArray primary_matches
,
528 ICollection query_term_list
,
530 UriFilter uri_filter
,
531 HitFilter hit_filter
)
// Exactly one of these is used: all_docs when every match fits within
// max_results, top_docs (a bounded best-N structure) otherwise.
533 TopScores top_docs
= null;
534 ArrayList all_docs
= null;
537 Logger
.Log
.Debug (">>> Initially handed {0} matches", primary_matches
.TrueCount
);
539 if (primary_matches
.TrueCount
<= max_results
) {
541 Logger
.Log
.Debug (">>> Initial count is within our limit of {0}", max_results
);
542 all_docs
= new ArrayList ();
545 Logger
.Log
.Debug (">>> Number of hits is capped at {0}", max_results
);
546 top_docs
= new TopScores (max_results
);
549 Stopwatch total
, a
, b
, c
;
550 total
= new Stopwatch ();
551 a
= new Stopwatch ();
552 b
= new Stopwatch ();
553 c
= new Stopwatch ();
558 // Pull in the primary documents.
559 // We walk across them backwards, since newer
560 // documents are more likely to be at the end of
562 int j
= primary_matches
.Count
;
565 i
= primary_matches
.GetPreviousTrueIndex (j
);
568 j
= i
-1; // This way we can't forget to adjust i
571 doc
= primary_searcher
.Doc (i
);
573 // Check the timestamp --- if we have already reached our
574 // limit, we might be able to reject it immediately.
575 string timestamp_str
;
576 long timestamp_num
= 0;
578 timestamp_str
= doc
.Get ("Timestamp");
579 if (timestamp_str
== null) {
580 Logger
.Log
.Warn ("No timestamp on {0}!", GetUriFromDocument (doc
));
582 timestamp_num
= Int64
.Parse (doc
.Get ("Timestamp"));
// Skip documents whose timestamp can no longer make the top N.
583 if (top_docs
!= null && ! top_docs
.WillAccept (timestamp_num
))
587 // If we have a UriFilter, apply it.
588 if (uri_filter
!= null) {
590 uri
= GetUriFromDocument (doc
);
591 if (! uri_filter (uri
))
595 DocAndId doc_and_id
= new DocAndId ();
596 doc_and_id
.Doc
= doc
;
599 // Add the document to the appropriate data structure.
600 // We use the timestamp_num as the score, so high
601 // scores correspond to more-recent timestamps.
602 if (all_docs
!= null)
603 all_docs
.Add (doc_and_id
);
605 top_docs
.Add (timestamp_num
, doc_and_id
);
612 ICollection final_list_of_docs
;
613 if (all_docs
!= null)
614 final_list_of_docs
= all_docs
;
616 final_list_of_docs
= top_docs
.TopScoringObjects
;
618 ArrayList final_list_of_hits
;
619 final_list_of_hits
= new ArrayList (final_list_of_docs
.Count
);
621 // This is used only for scoring
622 Hashtable hits_by_id
= null;
623 hits_by_id
= new Hashtable ();
625 // If we aren't using the secondary index, the next step is
626 // very straightforward.
627 if (secondary_searcher
== null) {
629 foreach (DocAndId doc_and_id
in final_list_of_docs
) {
631 hit
= DocumentToHit (doc_and_id
.Doc
);
632 hits_by_id
[doc_and_id
.Id
] = hit
;
633 final_list_of_hits
.Add (hit
);
639 Logger
.Log
.Debug (">>> Performing cross-index Hit reunification");
// Secondary-index path: index hits by Uri, collect the matching
// secondary documents, and merge their properties into the hits.
641 Hashtable hits_by_uri
;
642 hits_by_uri
= UriFu
.NewHashtable ();
644 LuceneBitArray secondary_matches
;
645 secondary_matches
= new LuceneBitArray (secondary_searcher
);
647 foreach (DocAndId doc_and_id
in final_list_of_docs
) {
649 hit
= DocumentToHit (doc_and_id
.Doc
);
650 hits_by_id
[doc_and_id
.Id
] = hit
;
651 hits_by_uri
[hit
.Uri
] = hit
;
652 secondary_matches
.AddUri (hit
.Uri
);
655 secondary_matches
.FlushUris ();
657 // Attach all of our secondary properties
662 i
= secondary_matches
.GetNextTrueIndex (j
);
663 if (i
>= secondary_matches
.Count
)
667 Document secondary_doc
;
668 secondary_doc
= secondary_searcher
.Doc (i
);
671 uri
= GetUriFromDocument (secondary_doc
);
674 hit
= hits_by_uri
[uri
] as Hit
;
676 AddPropertiesToHit (hit
, secondary_doc
, false);
678 final_list_of_hits
.Add (hit
);
682 ScoreHits (hits_by_id
, primary_reader
, query_term_list
);
686 // If we used the TopScores object, we got our original
687 // list of documents sorted for us. If not, sort the
689 if (top_docs
== null)
690 final_list_of_hits
.Sort ();
694 // If we have a hit_filter, use it now.
695 if (hit_filter
!= null) {
696 for (int i
= 0; i
< final_list_of_hits
.Count
; ++i
) {
698 hit
= final_list_of_hits
[i
] as Hit
;
699 if (! hit_filter (hit
)) {
701 Logger
.Log
.Debug ("Filtered out {0}", hit
.Uri
);
// Rejected hits are nulled in place rather than removed.
702 final_list_of_hits
[i
] = null;
707 // Before we broadcast a hit, we strip out any
708 // properties in the PrivateNamespace. We
709 // manipulate the property ArrayList directory,
710 // which is pretty gross... but this is safe,
711 // since removing items will not change the sort
713 foreach (Hit hit
in final_list_of_hits
) {
717 while (i
< hit
.Properties
.Count
) {
718 Property prop
= hit
.Properties
[i
] as Property
;
719 if (prop
.Key
.StartsWith (PrivateNamespace
))
720 hit
.Properties
.RemoveAt (i
);
726 result
.Add (final_list_of_hits
);
732 Logger
.Log
.Debug (">>> GenerateQueryResults time statistics:");
733 Logger
.Log
.Debug (">>> First pass {0} ({1:0.0}%)", a
, 100 * a
.ElapsedTime
/ total
.ElapsedTime
);
734 Logger
.Log
.Debug (">>> Hit assembly {0} ({1:0.0}%)", b
, 100 * b
.ElapsedTime
/ total
.ElapsedTime
);
735 Logger
.Log
.Debug (">>> Final pass {0} ({1:0.0}%)", c
, 100 * c
.ElapsedTime
/ total
.ElapsedTime
);
736 Logger
.Log
.Debug (">>> TOTAL {0}", total
);