//
// LuceneQueryingDriver.cs
//
// Copyright (C) 2004-2005 Novell, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
using System;
using System.Collections;
using System.Diagnostics;
using System.Globalization;
using System.Threading;
using System.Xml.Serialization;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using LNS = Lucene.Net.Search;

using Beagle.Util;	// Logger, Stopwatch, UriFu

namespace Beagle.Daemon {

	public class LuceneQueryingDriver : LuceneCommon {

		static public bool Debug = false;

		public const string PrivateNamespace = "_private:";

		public delegate bool UriFilter (Uri uri);
		public delegate double RelevancyMultiplier (Hit hit);

		public LuceneQueryingDriver (string index_name, int minor_version, bool read_only)
			: base (index_name, minor_version)
		{
			// FIXME: Maybe the LuceneQueryingDriver should never try to create the index?
			if (Exists ())
				Open (read_only);
			else if (! read_only)
				Create ();
			else {
				// We're in read-only mode, but we can't create an index.
				// Maybe a different exception would be better?  This one is caught
				// in QueryDriver.LoadStaticQueryable ()
				throw new InvalidOperationException ();
			}

			// Initialize the user text cache only if we're not in
			// read-only mode.  StaticQueryables instantiate their
			// own text caches that are stored in a separate
			// location.
			if (! read_only)
				text_cache = TextCache.UserCache;
		}
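
		// A note on structure (summarizing what the code below does): every
		// indexed item lives in a primary Lucene index, and may also have a
		// matching document in an optional secondary index that carries extra
		// properties.  The two are tied together by the item's Uri, so the
		// query code here repeatedly maps bit arrays of matches from one
		// index onto the other (see LuceneBitArray.Join / ProjectOnto below).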

		////////////////////////////////////////////////////////////////

		////////////////////////////////////////////////////////////////

		public Uri[] PropertyQuery (Property prop)
		{
			// FIXME: Should we support scanning the secondary
			// index as well?

			IndexReader primary_reader;
			LNS.IndexSearcher primary_searcher;

			primary_reader = LuceneCommon.GetReader (PrimaryStore);
			primary_searcher = new LNS.IndexSearcher (primary_reader);

			Term term = new Term (PropertyToFieldName (prop.Type, prop.Key), prop.Value);
			LNS.TermQuery query = new LNS.TermQuery (term);
			LNS.Hits hits = primary_searcher.Search (query);

			Uri[] uri_list = new Uri [hits.Length ()];
			for (int i = 0; i < hits.Length (); i++) {
				Document doc;
				doc = hits.Doc (i);
				uri_list [i] = GetUriFromDocument (doc);
			}

			primary_searcher.Close ();
			ReleaseReader (primary_reader);

			return uri_list;
		}
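
		// For example, a caller might resolve an exact property match like
		// this (illustrative only; Property.New is assumed to build an
		// exact-match property and is not defined in this file):
		//
		//    Uri[] uris = driver.PropertyQuery (Property.New ("beagle:MimeType", "text/html"));
		//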

		////////////////////////////////////////////////////////////////

		// Returns the lowest matching score before the results are
		// truncated.
		public void DoQuery (Query query,
				     IQueryResult result,
				     ICollection search_subset_uris, // should be internal uris
				     UriFilter uri_filter,
				     HitFilter hit_filter)
		{
			Stopwatch sw;
			sw = new Stopwatch ();
			sw.Start ();
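
			// Roughly, the work below breaks down into these steps:
			//   1. Turn each QueryPart into primary/secondary Lucene queries,
			//      collecting required and prohibited parts separately.
			//   2. Open searchers on the primary and (if present) secondary index.
			//   3. Build a whitelist from search_subset_uris and a blacklist from
			//      the prohibited parts, then fold both into a single whitelist.
			//   4. Run the required queries (one- or two-index variants) to get
			//      a bit array of matching primary documents.
			//   5. Hand the matches to GenerateQueryResults, which assembles and
			//      broadcasts the actual Hit objects.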

			// Assemble all of the parts into a bunch of Lucene queries

			ArrayList primary_required_part_queries = null;
			ArrayList secondary_required_part_queries = null;

			LNS.BooleanQuery primary_prohibited_part_query = null;
			LNS.BooleanQuery secondary_prohibited_part_query = null;

			AndHitFilter all_hit_filters;
			all_hit_filters = new AndHitFilter ();
			if (hit_filter != null)
				all_hit_filters.Add (hit_filter);

			ArrayList term_list = new ArrayList ();

			foreach (QueryPart part in query.Parts) {
				LNS.Query primary_part_query;
				LNS.Query secondary_part_query;
				HitFilter part_hit_filter;
				QueryPartToQuery (part,
						  false, // we want both primary and secondary queries
						  part.Logic == QueryPartLogic.Required ? term_list : null,
						  out primary_part_query,
						  out secondary_part_query,
						  out part_hit_filter);

				if (primary_part_query == null)
					continue;

				switch (part.Logic) {

				case QueryPartLogic.Required:
					if (primary_required_part_queries == null) {
						primary_required_part_queries = new ArrayList ();
						secondary_required_part_queries = new ArrayList ();
					}
					primary_required_part_queries.Add (primary_part_query);
					secondary_required_part_queries.Add (secondary_part_query);

					if (part_hit_filter != null)
						all_hit_filters.Add (part_hit_filter);

					break;

				case QueryPartLogic.Prohibited:
					// Prohibited parts are OR-ed together into a single
					// BooleanQuery, which later becomes a blacklist.
					if (primary_prohibited_part_query == null)
						primary_prohibited_part_query = new LNS.BooleanQuery ();
					primary_prohibited_part_query.Add (primary_part_query, false, false);

					if (secondary_part_query != null) {
						if (secondary_prohibited_part_query == null)
							secondary_prohibited_part_query = new LNS.BooleanQuery ();
						secondary_prohibited_part_query.Add (secondary_part_query, false, false);
					}

					if (part_hit_filter != null) {
						// A prohibited part's hit filter gets negated before
						// being added to the combined filter.
						NotHitFilter nhf;
						nhf = new NotHitFilter (part_hit_filter);
						all_hit_filters.Add (new HitFilter (nhf.HitFilter));
					}

					break;
				}
			}

			// If we have no required parts, give up.
			if (primary_required_part_queries == null)
				return;

			//
			// Now that we have all of these nice queries, let's execute them!
			//

			// Create the searchers that we will need.

			IndexReader primary_reader;
			LNS.IndexSearcher primary_searcher;
			IndexReader secondary_reader = null;
			LNS.IndexSearcher secondary_searcher = null;

			primary_reader = LuceneCommon.GetReader (PrimaryStore);
			primary_searcher = new LNS.IndexSearcher (primary_reader);

			if (SecondaryStore != null) {
				secondary_reader = LuceneCommon.GetReader (SecondaryStore);
				if (secondary_reader.NumDocs () == 0) {
					ReleaseReader (secondary_reader);
					secondary_reader = null;
				}
			}

			if (secondary_reader != null)
				secondary_searcher = new LNS.IndexSearcher (secondary_reader);

			// Possibly create our whitelists from the search subset.

			LuceneBitArray primary_whitelist = null;
			LuceneBitArray secondary_whitelist = null;

			if (search_subset_uris != null && search_subset_uris.Count > 0) {
				primary_whitelist = new LuceneBitArray (primary_searcher);
				if (secondary_searcher != null)
					secondary_whitelist = new LuceneBitArray (secondary_searcher);

				foreach (Uri uri in search_subset_uris) {
					primary_whitelist.AddUri (uri);
					if (secondary_whitelist != null)
						secondary_whitelist.AddUri (uri);
				}
				primary_whitelist.FlushUris ();
				if (secondary_whitelist != null)
					secondary_whitelist.FlushUris ();
			}

			// Build blacklists from our prohibited parts.

			LuceneBitArray primary_blacklist = null;
			LuceneBitArray secondary_blacklist = null;

			if (primary_prohibited_part_query != null) {
				primary_blacklist = new LuceneBitArray (primary_searcher,
									primary_prohibited_part_query);

				if (secondary_searcher != null) {
					secondary_blacklist = new LuceneBitArray (secondary_searcher);
					if (secondary_prohibited_part_query != null)
						secondary_blacklist.Or (secondary_prohibited_part_query);
					primary_blacklist.Join (secondary_blacklist);
				}
			}

			// Combine our whitelist and blacklist into just a whitelist.

			if (primary_blacklist != null) {
				if (primary_whitelist == null) {
					primary_blacklist.Not ();
					primary_whitelist = primary_blacklist;
				} else {
					primary_whitelist.AndNot (primary_blacklist);
				}
			}

			if (secondary_blacklist != null) {
				if (secondary_whitelist == null) {
					secondary_blacklist.Not ();
					secondary_whitelist = secondary_blacklist;
				} else {
					secondary_whitelist.AndNot (secondary_blacklist);
				}
			}

			BetterBitArray primary_matches = null;

			if (primary_required_part_queries != null) {

				if (secondary_searcher != null)
					primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
										      secondary_searcher,
										      primary_required_part_queries,
										      secondary_required_part_queries,
										      primary_whitelist,
										      secondary_whitelist);
				else
					primary_matches = DoRequiredQueries (primary_searcher,
									     primary_required_part_queries,
									     primary_whitelist);
			}

			if (Debug)
				Logger.Log.Debug ("###### Finished low-level queries in {0}", sw);

			// Only generate results if we got some matches
			if (primary_matches != null && primary_matches.ContainsTrue ()) {
				GenerateQueryResults (primary_reader,
						      primary_searcher,
						      secondary_searcher,
						      primary_matches,
						      result,
						      term_list,
						      query.MaxHits, // presumably the source of the max_results cap
						      uri_filter,
						      new HitFilter (all_hit_filters.HitFilter));
			}

			//
			// Finally, we clean up after ourselves.
			//

			primary_searcher.Close ();
			if (secondary_searcher != null)
				secondary_searcher.Close ();
			ReleaseReader (primary_reader);
			if (secondary_reader != null)
				ReleaseReader (secondary_reader);

			sw.Stop ();
			if (Debug)
				Logger.Log.Debug ("###### Processed query in {0}", sw);
		}
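
		// An illustrative call (the argument names here are hypothetical,
		// not from this file):
		//
		//    driver.DoQuery (query, result,
		//                    null,                                  // no subset restriction
		//                    new UriFilter (queryable.HitIsValid),
		//                    null);                                 // no extra hit filter
		//
		// where 'query' is the Query being executed and 'result' is the
		// IQueryResult that collects the generated Hits.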

		////////////////////////////////////////////////////////////////

		//
		// Special logic for handling our set of required queries
		//

		// This is the easy case: we just combine all of the queries
		// into one big BooleanQuery.
		private static BetterBitArray DoRequiredQueries (LNS.IndexSearcher primary_searcher,
								 ArrayList primary_queries,
								 BetterBitArray primary_whitelist)
		{
			LNS.BooleanQuery combined_query;
			combined_query = new LNS.BooleanQuery ();
			foreach (LNS.Query query in primary_queries)
				combined_query.Add (query, true, false); // required, not prohibited

			LuceneBitArray matches;
			matches = new LuceneBitArray (primary_searcher, combined_query);
			if (primary_whitelist != null)
				matches.And (primary_whitelist);

			return matches;
		}
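
		// To make the ordering below concrete: suppose three required parts
		// produce candidate sets with upper bounds of 40, 2500 and 300.  The
		// loop joins the 40-element set across the two indexes first, then
		// uses it to restrict the other two (typically shrinking them a lot)
		// before they are ever joined.  If any set's upper bound drops to
		// zero, the whole query short-circuits to an empty result.  (The
		// numbers are invented; the mechanism is what matters.)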

		// This code attempts to execute N required queries in the
		// most efficient order to minimize the amount of time spent
		// joining between the two indexes.  It returns a joined bit
		// array of matches against the primary index.

		private class MatchInfo : IComparable {

			public LuceneBitArray PrimaryMatches = null;
			public LuceneBitArray SecondaryMatches = null;
			public int UpperBound = 0;

			public void Join ()
			{
				PrimaryMatches.Join (SecondaryMatches);
			}

			public void RestrictBy (MatchInfo joined)
			{
				if (joined != null) {
					this.PrimaryMatches.And (joined.PrimaryMatches);
					this.SecondaryMatches.And (joined.SecondaryMatches);
				}

				// UpperBound is a cheap estimate of how many hits this
				// part could still produce: the sum of the true bits in
				// the two (unjoined) match arrays.
				UpperBound = 0;
				UpperBound += PrimaryMatches.TrueCount;
				UpperBound += SecondaryMatches.TrueCount;
			}

			public int CompareTo (object obj)
			{
				MatchInfo other = (MatchInfo) obj;
				return this.UpperBound - other.UpperBound;
			}
		}

		// Any whitelists that are passed in must be fully joined, or
		// query results will be incorrect.
		private static BetterBitArray DoRequiredQueries_TwoIndex (LNS.IndexSearcher primary_searcher,
									  LNS.IndexSearcher secondary_searcher,
									  ArrayList primary_queries,
									  ArrayList secondary_queries,
									  BetterBitArray primary_whitelist,
									  BetterBitArray secondary_whitelist)
		{
			ArrayList match_info_list;
			match_info_list = new ArrayList ();

			// First, do all of the low-level queries
			// and store them in our MatchInfo
			for (int i = 0; i < primary_queries.Count; ++i) {
				LNS.Query pq, sq;
				pq = primary_queries [i] as LNS.Query;
				sq = secondary_queries [i] as LNS.Query;

				LuceneBitArray p_matches = null, s_matches = null;
				p_matches = new LuceneBitArray (primary_searcher);
				if (pq != null) {
					p_matches.Or (pq);
					if (primary_whitelist != null)
						p_matches.And (primary_whitelist);
				}

				s_matches = new LuceneBitArray (secondary_searcher);
				if (sq != null) {
					s_matches.Or (sq);
					if (secondary_whitelist != null)
						s_matches.And (secondary_whitelist);
				}

				MatchInfo info;
				info = new MatchInfo ();
				info.PrimaryMatches = p_matches;
				info.SecondaryMatches = s_matches;
				info.RestrictBy (null); // a hack to initialize the UpperBound
				match_info_list.Add (info);
			}

			// We want to be smart about the order we do this in,
			// to minimize the expense of the Join.
			while (match_info_list.Count > 1) {

				// linear scan to find the minimum
				int index_min = 0;
				for (int i = 1; i < match_info_list.Count; ++i)
					if (((MatchInfo) match_info_list [i]).CompareTo ((MatchInfo) match_info_list [index_min]) < 0)
						index_min = i;

				MatchInfo smallest;
				smallest = match_info_list [index_min] as MatchInfo;
				match_info_list.RemoveAt (index_min);

				// We can short-circuit if our smallest set of
				// matches is empty.
				if (smallest.UpperBound == 0)
					return smallest.PrimaryMatches; // this must be an empty array.

				smallest.Join ();

				foreach (MatchInfo info in match_info_list)
					info.RestrictBy (smallest);
			}

			// For the final pair, we don't need to do a full join:
			// mapping the secondary onto the primary is sufficient
			MatchInfo last;
			last = match_info_list [0] as MatchInfo;
			last.SecondaryMatches.ProjectOnto (last.PrimaryMatches);

			return last.PrimaryMatches;
		}

		////////////////////////////////////////////////////////////////
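
		// ScoreHits re-scores the capped hit list with a simple tf*idf sum
		// over the query terms, using whatever Lucene returns as the default
		// Similarity.  (With the stock DefaultSimilarity that is roughly
		// tf = sqrt(term frequency) and idf = log (numDocs / (docFreq + 1)) + 1,
		// though the exact formula is Lucene's, not this file's.)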

		static private void ScoreHits (Hashtable hits_by_id,
					       IndexReader reader,
					       ICollection term_list)
		{
			Stopwatch sw;
			sw = new Stopwatch ();
			sw.Start ();

			LNS.Similarity similarity;
			similarity = LNS.Similarity.GetDefault ();

			foreach (Term term in term_list) {

				double idf;
				idf = similarity.Idf (reader.DocFreq (term), reader.MaxDoc ());

				int hit_count;
				hit_count = hits_by_id.Count;

				TermDocs term_docs;
				term_docs = reader.TermDocs (term);
				while (term_docs.Next () && hit_count > 0) {

					int id;
					id = term_docs.Doc ();

					Hit hit;
					hit = hits_by_id [id] as Hit;
					if (hit != null) {
						double tf;
						tf = similarity.Tf (term_docs.Freq ());
						hit.Score += tf * idf;
						--hit_count;
					}
				}
			}

			sw.Stop ();
		}

		////////////////////////////////////////////////////////////////

		private class DocAndId {
			public Document Doc;
			public int Id;
		}
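
		// GenerateQueryResults works in three timed passes (the a/b/c
		// stopwatches below): first it walks the primary matches newest-first,
		// keeping at most max_results documents (using the timestamp as the
		// score); then it builds Hit objects, pulling extra properties out of
		// the secondary index when one is present and re-scoring via ScoreHits;
		// finally it sorts, applies the hit_filter, strips _private: properties
		// and hands the list to the result.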

		//
		// Given a set of hits, broadcast some set out as our query
		// results.
		//

		private static void GenerateQueryResults (IndexReader primary_reader,
							  LNS.IndexSearcher primary_searcher,
							  LNS.IndexSearcher secondary_searcher,
							  BetterBitArray primary_matches,
							  IQueryResult result,
							  ICollection query_term_list,
							  int max_results,
							  UriFilter uri_filter,
							  HitFilter hit_filter)
		{
			TopScores top_docs = null;
			ArrayList all_docs = null;

			if (Debug)
				Logger.Log.Debug (">>> Initially handed {0} matches", primary_matches.TrueCount);

			if (primary_matches.TrueCount <= max_results) {
				if (Debug)
					Logger.Log.Debug (">>> Initial count is within our limit of {0}", max_results);
				all_docs = new ArrayList ();
			} else {
				if (Debug)
					Logger.Log.Debug (">>> Number of hits is capped at {0}", max_results);
				top_docs = new TopScores (max_results);
			}

			Stopwatch total, a, b, c;
			total = new Stopwatch ();
			a = new Stopwatch ();
			b = new Stopwatch ();
			c = new Stopwatch ();

			total.Start ();
			a.Start ();

			// Pull in the primary documents.
			// We walk across them backwards, since newer
			// documents are more likely to be at the end of
			// the index.
			int j = primary_matches.Count;
			while (true) {
				int i;
				i = primary_matches.GetPreviousTrueIndex (j);
				if (i < 0)
					break;
				j = i-1; // This way we can't forget to adjust i

				Document doc;
				doc = primary_searcher.Doc (i);

				// Check the timestamp --- if we have already reached our
				// limit, we might be able to reject it immediately.
				string timestamp_str;
				long timestamp_num = 0;

				timestamp_str = doc.Get ("Timestamp");
				if (timestamp_str == null) {
					Logger.Log.Warn ("No timestamp on {0}!", GetUriFromDocument (doc));
				} else {
					timestamp_num = Int64.Parse (doc.Get ("Timestamp"));
					if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
						continue;
				}

				// If we have a UriFilter, apply it.
				if (uri_filter != null) {
					Uri uri;
					uri = GetUriFromDocument (doc);
					if (! uri_filter (uri))
						continue;
				}

				DocAndId doc_and_id = new DocAndId ();
				doc_and_id.Doc = doc;
				doc_and_id.Id = i;

				// Add the document to the appropriate data structure.
				// We use the timestamp_num as the score, so high
				// scores correspond to more-recent timestamps.
				if (all_docs != null)
					all_docs.Add (doc_and_id);
				else
					top_docs.Add (timestamp_num, doc_and_id);
			}

			a.Stop ();
			b.Start ();

			ICollection final_list_of_docs;
			if (all_docs != null)
				final_list_of_docs = all_docs;
			else
				final_list_of_docs = top_docs.TopScoringObjects;

			ArrayList final_list_of_hits;
			final_list_of_hits = new ArrayList (final_list_of_docs.Count);

			// This is used only for scoring
			Hashtable hits_by_id = null;
			hits_by_id = new Hashtable ();

			// If we aren't using the secondary index, the next step is
			// very straightforward.
			if (secondary_searcher == null) {

				foreach (DocAndId doc_and_id in final_list_of_docs) {
					Hit hit;
					hit = DocumentToHit (doc_and_id.Doc);
					hits_by_id [doc_and_id.Id] = hit;
					final_list_of_hits.Add (hit);
				}

			} else {

				if (Debug)
					Logger.Log.Debug (">>> Performing cross-index Hit reunification");

				Hashtable hits_by_uri;
				hits_by_uri = UriFu.NewHashtable ();

				LuceneBitArray secondary_matches;
				secondary_matches = new LuceneBitArray (secondary_searcher);

				foreach (DocAndId doc_and_id in final_list_of_docs) {
					Hit hit;
					hit = DocumentToHit (doc_and_id.Doc);
					hits_by_id [doc_and_id.Id] = hit;
					hits_by_uri [hit.Uri] = hit;
					secondary_matches.AddUri (hit.Uri);
				}

				secondary_matches.FlushUris ();
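
				// Everything below relies on the Uri as the cross-index key:
				// FlushUris () has just turned the Uris we queued with AddUri ()
				// into a bit array over the secondary index, which we now walk.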

				// Attach all of our secondary properties
				// to the hits.
				j = 0;
				while (true) {
					int i;
					i = secondary_matches.GetNextTrueIndex (j);
					if (i >= secondary_matches.Count)
						break;
					j = i+1;

					Document secondary_doc;
					secondary_doc = secondary_searcher.Doc (i);

					Uri uri;
					uri = GetUriFromDocument (secondary_doc);

					Hit hit;
					hit = hits_by_uri [uri] as Hit;

					AddPropertiesToHit (hit, secondary_doc, false);

					final_list_of_hits.Add (hit);
				}
			}

			ScoreHits (hits_by_id, primary_reader, query_term_list);

			b.Stop ();
			c.Start ();

			// If we used the TopScores object, we got our original
			// list of documents sorted for us.  If not, sort the
			// final list.
			if (top_docs == null)
				final_list_of_hits.Sort ();

			// If we have a hit_filter, use it now.
			if (hit_filter != null) {
				for (int i = 0; i < final_list_of_hits.Count; ++i) {
					Hit hit;
					hit = final_list_of_hits [i] as Hit;
					if (! hit_filter (hit)) {
						if (Debug)
							Logger.Log.Debug ("Filtered out {0}", hit.Uri);
						final_list_of_hits [i] = null;
					}
				}
			}

			// Before we broadcast a hit, we strip out any
			// properties in the PrivateNamespace.  We
			// manipulate the property ArrayList directly,
			// which is pretty gross... but this is safe,
			// since removing items will not change the sort
			// order.
			foreach (Hit hit in final_list_of_hits) {
				if (hit == null)
					continue;
				int i = 0;
				while (i < hit.Properties.Count) {
					Property prop = hit.Properties [i] as Property;
					if (prop.Key.StartsWith (PrivateNamespace))
						hit.Properties.RemoveAt (i);
					else
						++i;
				}
			}

			result.Add (final_list_of_hits);

			c.Stop ();
			total.Stop ();

			Logger.Log.Debug (">>> GenerateQueryResults time statistics:");
			Logger.Log.Debug (">>> First pass {0} ({1:0.0}%)", a, 100 * a.ElapsedTime / total.ElapsedTime);
			Logger.Log.Debug (">>> Hit assembly {0} ({1:0.0}%)", b, 100 * b.ElapsedTime / total.ElapsedTime);
			Logger.Log.Debug (">>> Final pass {0} ({1:0.0}%)", c, 100 * c.ElapsedTime / total.ElapsedTime);
			Logger.Log.Debug (">>> TOTAL {0}", total);
		}