2 // LuceneQueryingDriver.cs
4 // Copyright (C) 2004-2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System
.Collections
;
29 using System
.Diagnostics
;
30 using System
.Globalization
;
33 using System
.Threading
;
35 using System
.Xml
.Serialization
;
37 using Lucene
.Net
.Analysis
;
38 using Lucene
.Net
.Analysis
.Standard
;
39 using Lucene
.Net
.Documents
;
40 using Lucene
.Net
.Index
;
41 using Lucene
.Net
.QueryParsers
;
42 using LNS
= Lucene
.Net
.Search
;
46 namespace Beagle
.Daemon
{
48 public class LuceneQueryingDriver
: LuceneCommon
{
50 static public bool Debug
= false;
52 public const string PrivateNamespace
= "_private:";
54 public delegate bool UriFilter (Uri uri
);
55 public delegate double RelevancyMultiplier (Hit hit
);
// Constructs a querying driver over the named Lucene index, delegating
// index bookkeeping to the LuceneCommon base class.
// NOTE(review): the constructor body's statements are not visible in this
// chunk (original lines are missing); only the author's FIXMEs remain.
57 public LuceneQueryingDriver (string index_name
, int minor_version
, bool read_only
)
58 : base (index_name
, minor_version
)
60 // FIXME: Maybe the LuceneQueryingDriver should never try to create the index?
63 // FIXME: Do something sane if we're in read-only mode and want to create an index.
68 ////////////////////////////////////////////////////////////////
// Closure object that wraps a HitFilter delegate and exposes its logical
// negation via Filter().  Used by NotHitFilter() below.
70 private class NotHitFilter_Closure
{
// Captures the filter to be negated.
73 public NotHitFilter_Closure (HitFilter original
)
75 this.original
= original
;
// Accepts a hit exactly when the wrapped filter rejects it.
78 public bool Filter (Hit hit
)
80 return ! original (hit
);
// Returns a HitFilter delegate that accepts exactly the hits 'filter'
// rejects, by wrapping it in a NotHitFilter_Closure.
84 public HitFilter
NotHitFilter (HitFilter filter
)
86 NotHitFilter_Closure closure
;
87 closure
= new NotHitFilter_Closure (filter
);
88 return new HitFilter (closure
.Filter
);
91 ////////////////////////////////////////////////////////////////
93 // Returns the lowest matching score before the results are
// Executes a Beagle Query against the primary (and, when present,
// secondary) Lucene index: builds per-part Lucene queries, applies
// whitelist/blacklist restrictions, runs the required queries, and hands
// the matches to GenerateQueryResults.  Readers/searchers opened here are
// closed again at the end of the method.
// NOTE(review): several original lines of this method are not visible in
// this chunk (e.g. early-return bodies and parts of argument lists), so
// the flow annotated below is partial.
95 public void DoQuery (Query query
,
97 ICollection search_subset_uris
, // should be internal uris
// Stopwatch used for the timing Debug output at the end of the method.
102 sw
= new Stopwatch ();
105 // Assemble all of the parts into a bunch of Lucene queries
107 ArrayList primary_required_part_queries
= null;
108 ArrayList secondary_required_part_queries
= null;
110 LNS
.BooleanQuery primary_prohibited_part_query
= null;
111 LNS
.BooleanQuery secondary_prohibited_part_query
= null;
// Collect every hit filter (the caller's plus per-part ones).
113 ArrayList all_hit_filters
= new ArrayList ();
114 all_hit_filters
.Add (hit_filter
);
// Terms from required parts, later used for scoring (ScoreHits).
116 ArrayList term_list
= new ArrayList ();
// Translate each QueryPart into primary/secondary Lucene queries plus an
// optional hit filter, sorting them by Required vs. Prohibited logic.
118 foreach (QueryPart part
in query
.Parts
) {
119 LNS
.Query primary_part_query
;
120 LNS
.Query secondary_part_query
;
121 HitFilter part_hit_filter
;
122 QueryPartToQuery (part
,
123 false, // we want both primary and secondary queries
124 part
.Logic
== QueryPartLogic
.Required
? term_list
: null,
125 out primary_part_query
,
126 out secondary_part_query
,
127 out part_hit_filter
);
129 switch (part
.Logic
) {
131 case QueryPartLogic
.Required
:
// Required-part lists are created lazily on the first required part.
132 if (primary_required_part_queries
== null) {
133 primary_required_part_queries
= new ArrayList ();
134 secondary_required_part_queries
= new ArrayList ();
136 primary_required_part_queries
.Add (primary_part_query
);
137 secondary_required_part_queries
.Add (secondary_part_query
);
139 if (part_hit_filter
!= null)
140 all_hit_filters
.Add (part_hit_filter
);
// Prohibited parts are OR'd into a single BooleanQuery per index.
144 case QueryPartLogic
.Prohibited
:
145 if (primary_prohibited_part_query
== null)
146 primary_prohibited_part_query
= new LNS
.BooleanQuery ();
147 primary_prohibited_part_query
.Add (primary_part_query
, false, false);
149 if (secondary_part_query
!= null) {
150 if (secondary_prohibited_part_query
== null)
151 secondary_prohibited_part_query
= new LNS
.BooleanQuery ();
152 secondary_prohibited_part_query
.Add (secondary_part_query
, false, false);
// A prohibited part's hit filter is applied negated.
155 if (part_hit_filter
!= null)
156 all_hit_filters
.Add (NotHitFilter (part_hit_filter
));
162 // If we have no required parts, give up.
163 if (primary_required_part_queries
== null)
167 // Now that we have all of these nice queries, let's execute them!
170 // Create the searchers that we will need.
172 IndexReader primary_reader
;
173 LNS
.IndexSearcher primary_searcher
;
174 IndexReader secondary_reader
= null;
175 LNS
.IndexSearcher secondary_searcher
= null;
177 primary_reader
= IndexReader
.Open (PrimaryStore
);
178 primary_searcher
= new LNS
.IndexSearcher (primary_reader
);
// An empty secondary index is treated the same as no secondary index.
180 if (SecondaryStore
!= null) {
181 secondary_reader
= IndexReader
.Open (SecondaryStore
);
182 if (secondary_reader
.NumDocs () == 0) {
183 secondary_reader
.Close ();
184 secondary_reader
= null;
188 if (secondary_reader
!= null)
189 secondary_searcher
= new LNS
.IndexSearcher (secondary_reader
);
192 // Possibly create our whitelists from the search subset.
194 LuceneBitArray primary_whitelist
= null;
195 LuceneBitArray secondary_whitelist
= null;
197 if (search_subset_uris
!= null && search_subset_uris
.Count
> 0) {
198 primary_whitelist
= new LuceneBitArray (primary_searcher
);
199 if (secondary_searcher
!= null)
200 secondary_whitelist
= new LuceneBitArray (secondary_searcher
);
// Mark every subset uri in both whitelists, then flush the batched
// uri additions into the underlying bit arrays.
202 foreach (Uri uri
in search_subset_uris
) {
203 primary_whitelist
.AddUri (uri
);
204 if (secondary_whitelist
!= null)
205 secondary_whitelist
.AddUri (uri
);
207 primary_whitelist
.FlushUris ();
208 if (secondary_whitelist
!= null)
209 secondary_whitelist
.FlushUris ();
213 // Build blacklists from our prohibited parts.
215 LuceneBitArray primary_blacklist
= null;
216 LuceneBitArray secondary_blacklist
= null;
218 if (primary_prohibited_part_query
!= null) {
219 primary_blacklist
= new LuceneBitArray (primary_searcher
,
220 primary_prohibited_part_query
);
222 secondary_blacklist
= new LuceneBitArray (secondary_searcher
);
223 if (secondary_prohibited_part_query
!= null)
224 secondary_blacklist
.Or (secondary_prohibited_part_query
);
// Join maps matches between the two indexes so both lists agree.
225 primary_blacklist
.Join (secondary_blacklist
);
229 // Combine our whitelist and blacklist into just a whitelist.
// With no whitelist, the inverted blacklist becomes the whitelist;
// otherwise blacklisted bits are cleared from the whitelist.
231 if (primary_blacklist
!= null) {
232 if (primary_whitelist
== null) {
233 primary_blacklist
.Not ();
234 primary_whitelist
= primary_blacklist
;
236 primary_whitelist
.AndNot (primary_blacklist
);
240 if (secondary_blacklist
!= null) {
241 if (secondary_whitelist
== null) {
242 secondary_blacklist
.Not ();
243 secondary_whitelist
= secondary_blacklist
;
245 secondary_whitelist
.AndNot (secondary_blacklist
);
// Run the required queries, choosing the two-index strategy only when a
// usable secondary searcher exists.
249 BetterBitArray primary_matches
= null;
251 if (primary_required_part_queries
!= null) {
253 if (secondary_searcher
!= null)
254 primary_matches
= DoRequiredQueries_TwoIndex (primary_searcher
,
256 primary_required_part_queries
,
257 secondary_required_part_queries
,
259 secondary_whitelist
);
261 primary_matches
= DoRequiredQueries (primary_searcher
,
262 primary_required_part_queries
,
269 Logger
.Log
.Debug ("###### Finished low-level queries in {0}", sw
);
273 // If we didn't get any matches, give up.
275 if (primary_matches
== null || ! primary_matches
.ContainsTrue ())
// NOTE(review): remaining GenerateQueryResults arguments are not visible
// in this chunk.
278 GenerateQueryResults (primary_reader
,
290 // Finally, we clean up after ourselves.
293 primary_reader
.Close ();
294 if (secondary_reader
!= null)
295 secondary_reader
.Close ();
296 primary_searcher
.Close ();
297 if (secondary_searcher
!= null)
298 secondary_searcher
.Close ();
303 Logger
.Log
.Debug ("###### Processed query in {0}", sw
);
307 ////////////////////////////////////////////////////////////////
310 // Special logic for handling our set of required queries
313 // This is the easy case: we just combine all of the queries
314 // into one big BooleanQuery.
// Single-index case: AND all required queries together in one BooleanQuery,
// run it, and intersect the result with the whitelist (when present).
// NOTE(review): the method's return statement is not visible in this chunk;
// presumably it returns 'matches' — confirm against the full file.
315 private static BetterBitArray
DoRequiredQueries (LNS
.IndexSearcher primary_searcher
,
316 ArrayList primary_queries
,
317 BetterBitArray primary_whitelist
)
319 LNS
.BooleanQuery combined_query
;
320 combined_query
= new LNS
.BooleanQuery ();
// (query, required: true, prohibited: false) — every clause must match.
321 foreach (LNS
.Query query
in primary_queries
)
322 combined_query
.Add (query
, true, false);
324 LuceneBitArray matches
;
325 matches
= new LuceneBitArray (primary_searcher
, combined_query
);
326 if (primary_whitelist
!= null)
327 matches
.And (primary_whitelist
);
332 // This code attempts to execute N required queries in the
333 // most efficient order to minimize the amount of time spent
334 // joining between the two indexes. It returns a joined bit
335 // array of matches against the primary index.
// Pairs the primary- and secondary-index match sets for one required query,
// with an UpperBound used to order the joins cheapest-first.  IComparable
// so a list of MatchInfos can be sorted by UpperBound.
337 private class MatchInfo
: IComparable
{
339 public LuceneBitArray PrimaryMatches
= null;
340 public LuceneBitArray SecondaryMatches
= null;
341 public int UpperBound
= 0;
// NOTE(review): the enclosing method declaration for this Join call
// (original lines ~343-344) is not visible in this chunk.
345 PrimaryMatches
.Join (SecondaryMatches
);
// Intersects this match pair with 'joined' (when non-null) and then
// recomputes UpperBound as the sum of both TrueCounts.  Called with
// null once just to initialize UpperBound.
348 public void RestrictBy (MatchInfo joined
)
350 if (joined
!= null) {
351 this.PrimaryMatches
.And (joined
.PrimaryMatches
);
352 this.SecondaryMatches
.And (joined
.SecondaryMatches
);
356 UpperBound
+= PrimaryMatches
.TrueCount
;
357 UpperBound
+= SecondaryMatches
.TrueCount
;
// Orders MatchInfos ascending by UpperBound (smallest match set first).
360 public int CompareTo (object obj
)
362 MatchInfo other
= (MatchInfo
) obj
;
363 return this.UpperBound
- other
.UpperBound
;
367 // Any whitelists that are passed in must be fully joined, or
368 // query results will be incorrect.
// Two-index case: runs each required query against both indexes, then
// intersects the per-query match sets in increasing-cost order (smallest
// UpperBound first) to minimize cross-index Join work.  Returns the final
// set of primary-index matches.
// NOTE(review): the lines that actually execute pq/sq into p_matches and
// s_matches (original ~388, ~395) are not visible in this chunk.
367 // Any whitelists that are passed in must be fully joined, or
368 // query results will be incorrect.
369 private static BetterBitArray
DoRequiredQueries_TwoIndex (LNS
.IndexSearcher primary_searcher
,
370 LNS
.IndexSearcher secondary_searcher
,
371 ArrayList primary_queries
,
372 ArrayList secondary_queries
,
373 BetterBitArray primary_whitelist
,
374 BetterBitArray secondary_whitelist
)
376 ArrayList match_info_list
;
377 match_info_list
= new ArrayList ();
379 // First, do all of the low-level queries
380 // and store them in our MatchInfo
381 for (int i
= 0; i
< primary_queries
.Count
; ++i
) {
383 pq
= primary_queries
[i
] as LNS
.Query
;
384 sq
= secondary_queries
[i
] as LNS
.Query
;
386 LuceneBitArray p_matches
= null, s_matches
= null;
387 p_matches
= new LuceneBitArray (primary_searcher
);
390 if (primary_whitelist
!= null)
391 p_matches
.And (primary_whitelist
);
394 s_matches
= new LuceneBitArray (secondary_searcher
);
397 if (secondary_whitelist
!= null)
398 s_matches
.And (secondary_whitelist
);
// RestrictBy(null) is a hack that just computes the initial UpperBound.
402 info
= new MatchInfo ();
403 info
.PrimaryMatches
= p_matches
;
404 info
.SecondaryMatches
= s_matches
;
405 info
.RestrictBy (null); // a hack to initialize the UpperBound
406 match_info_list
.Add (info
);
409 // We want to be smart about the order we do this in,
410 // to minimize the expense of the Join.
411 while (match_info_list
.Count
> 1) {
413 // FIXME: We don't really need to sort here, it would
414 // be sufficient to just find the minimal element.
415 match_info_list
.Sort ();
// Pull off the cheapest (smallest UpperBound) match set and fold it
// into all the others.
417 smallest
= match_info_list
[0] as MatchInfo
;
418 match_info_list
.RemoveAt (0);
420 // We can short-circuit if our smallest set of
422 if (smallest
.UpperBound
== 0)
423 return smallest
.PrimaryMatches
; // this must be an empty array.
427 foreach (MatchInfo info
in match_info_list
)
428 info
.RestrictBy (smallest
);
431 // For the final pair, we don't need to do a full join:
432 // mapping the secondary onto the primary is sufficient
434 last
= match_info_list
[0] as MatchInfo
;
435 last
.SecondaryMatches
.ProjectOnto (last
.PrimaryMatches
);
437 return last
.PrimaryMatches
;
440 ////////////////////////////////////////////////////////////////
// Scores each Hit in hits_by_id by accumulating tf*idf over the query
// terms, using the default Lucene Similarity and the reader's term-doc
// postings.  Scores are added onto Hit.Score in place.
// NOTE(review): the IndexReader parameter declaration and the null-check
// around 'hit' are among the original lines not visible in this chunk.
442 static private void ScoreHits (Hashtable hits_by_id
,
444 ICollection term_list
)
// Stopwatch for the timing Debug line at the end.
447 sw
= new Stopwatch ();
450 LNS
.Similarity similarity
;
451 similarity
= LNS
.Similarity
.GetDefault ();
453 foreach (Term term
in term_list
) {
// Inverse document frequency for this term across the whole index.
456 idf
= similarity
.Idf (reader
.DocFreq (term
), reader
.MaxDoc ());
459 hit_count
= hits_by_id
.Count
;
// Walk this term's postings, bumping the score of every doc that is
// one of our hits; hit_count lets the loop stop early.
462 term_docs
= reader
.TermDocs (term
);
463 while (term_docs
.Next () && hit_count
> 0) {
466 id
= term_docs
.Doc ();
469 hit
= hits_by_id
[id
] as Hit
;
472 tf
= similarity
.Tf (term_docs
.Freq ());
473 hit
.Score
+= tf
* idf
;
480 Logger
.Log
.Debug ("Scored {0} hits in {1}", hits_by_id
.Count
, sw
);
483 ////////////////////////////////////////////////////////////////
// Small carrier pairing a Lucene Document with its doc id.
// NOTE(review): the class body (its fields) is not visible in this chunk;
// usage below assigns .Doc and reads .Id — confirm against the full file.
485 private class DocAndId
{
491 // Given a set of hits, broadcast some set out as our query
495 private static void GenerateQueryResults (IndexReader primary_reader
,
496 LNS
.IndexSearcher primary_searcher
,
497 LNS
.IndexSearcher secondary_searcher
,
498 BetterBitArray primary_matches
,
500 ICollection query_term_list
,
504 UriFilter uri_filter
,
505 HitFilter hit_filter
)
507 TopScores top_docs
= null;
508 ArrayList all_docs
= null;
510 long min_date_num
, max_date_num
;
511 min_date_num
= Int64
.Parse (StringFu
.DateTimeToString (min_date
));
512 max_date_num
= Int64
.Parse (StringFu
.DateTimeToString (max_date
));
514 if (max_date_num
< min_date_num
)
518 Logger
.Log
.Debug (">>> Initially handed {0} matches", primary_matches
.TrueCount
);
520 if (primary_matches
.TrueCount
<= max_results
) {
522 Logger
.Log
.Debug (">>> Initial count is within our limit of {0}", max_results
);
523 all_docs
= new ArrayList ();
526 Logger
.Log
.Debug (">>> Number of hits is capped at {0}", max_results
);
527 top_docs
= new TopScores (max_results
);
530 Stopwatch total
, a
, b
, c
;
531 total
= new Stopwatch ();
532 a
= new Stopwatch ();
533 b
= new Stopwatch ();
534 c
= new Stopwatch ();
539 // Pull in the primary documents.
540 // We walk across them backwards, since newer
541 // documents are more likely to be at the end of
543 int j
= primary_matches
.Count
;
546 i
= primary_matches
.GetPreviousTrueIndex (j
);
549 j
= i
-1; // This way we can't forget to adjust i
552 doc
= primary_searcher
.Doc (i
);
554 // Check the timestamp to make sure it is in range
556 timestamp_num
= Int64
.Parse (doc
.Get ("Timestamp"));
557 if (timestamp_num
< min_date_num
|| max_date_num
< timestamp_num
)
560 if (top_docs
!= null && ! top_docs
.WillAccept (timestamp_num
))
563 // If we have a UriFilter, apply it.
564 if (uri_filter
!= null) {
566 uri
= GetUriFromDocument (doc
);
567 if (! uri_filter (uri
))
571 DocAndId doc_and_id
= new DocAndId ();
572 doc_and_id
.Doc
= doc
;
575 // Add the document to the appropriate data structure.
576 // We use the timestamp_num as the score, so high
577 // scores correspond to more-recent timestamps.
578 if (all_docs
!= null)
579 all_docs
.Add (doc_and_id
);
581 top_docs
.Add (timestamp_num
, doc_and_id
);
588 ICollection final_list_of_docs
;
589 if (all_docs
!= null)
590 final_list_of_docs
= all_docs
;
592 final_list_of_docs
= top_docs
.TopScoringObjects
;
594 ArrayList final_list_of_hits
;
595 final_list_of_hits
= new ArrayList (final_list_of_docs
.Count
);
597 // This is used only for scoring
598 Hashtable hits_by_id
= null;
599 hits_by_id
= new Hashtable ();
601 // If we aren't using the secondary index, the next step is
602 // very straightforward.
603 if (secondary_searcher
== null) {
605 foreach (DocAndId doc_and_id
in final_list_of_docs
) {
607 hit
= DocumentToHit (doc_and_id
.Doc
);
608 hits_by_id
[doc_and_id
.Id
] = hit
;
609 final_list_of_hits
.Add (hit
);
615 Logger
.Log
.Debug (">>> Performing cross-index Hit reunification");
617 Hashtable hits_by_uri
;
618 hits_by_uri
= UriFu
.NewHashtable ();
620 LuceneBitArray secondary_matches
;
621 secondary_matches
= new LuceneBitArray (secondary_searcher
);
623 foreach (DocAndId doc_and_id
in final_list_of_docs
) {
625 hit
= DocumentToHit (doc_and_id
.Doc
);
626 hits_by_id
[doc_and_id
.Id
] = hit
;
627 hits_by_uri
[hit
.Uri
] = hit
;
628 secondary_matches
.AddUri (hit
.Uri
);
631 secondary_matches
.FlushUris ();
633 // Attach all of our secondary properties
638 i
= secondary_matches
.GetNextTrueIndex (j
);
639 if (i
>= secondary_matches
.Count
)
643 Document secondary_doc
;
644 secondary_doc
= secondary_searcher
.Doc (i
);
647 uri
= GetUriFromDocument (secondary_doc
);
650 hit
= hits_by_uri
[uri
] as Hit
;
652 AddPropertiesToHit (hit
, secondary_doc
, false);
654 final_list_of_hits
.Add (hit
);
658 ScoreHits (hits_by_id
, primary_reader
, query_term_list
);
662 // If we used the TopScores object, we got our original
663 // list of documents sorted for us. If not, sort the
665 if (top_docs
== null)
666 final_list_of_hits
.Sort ();
670 // If we have a hit_filter, use it now.
671 if (hit_filter
!= null) {
672 for (int i
= 0; i
< final_list_of_hits
.Count
; ++i
) {
674 hit
= final_list_of_hits
[i
] as Hit
;
675 if (! hit_filter (hit
)) {
677 Logger
.Log
.Debug ("Filtered out {0}", hit
.Uri
);
678 final_list_of_hits
[i
] = null;
683 // Before we broadcast a hit, we strip out any
684 // properties in the PrivateNamespace. We
685 // manipulate the property ArrayList directory,
686 // which is pretty gross... but this is safe,
687 // since removing items will not change the sort
689 foreach (Hit hit
in final_list_of_hits
) {
693 while (i
< hit
.Properties
.Count
) {
694 Property prop
= hit
.Properties
[i
] as Property
;
695 if (prop
.Key
.StartsWith (PrivateNamespace
))
696 hit
.Properties
.RemoveAt (i
);
702 result
.Add (final_list_of_hits
);
708 Logger
.Log
.Debug (">>> GenerateQueryResults time statistics:");
709 Logger
.Log
.Debug (">>> First pass {0} ({1:0.0}%)", a
, 100 * a
.ElapsedTime
/ total
.ElapsedTime
);
710 Logger
.Log
.Debug (">>> Hit assembly {0} ({1:0.0}%)", b
, 100 * b
.ElapsedTime
/ total
.ElapsedTime
);
711 Logger
.Log
.Debug (">>> Final pass {0} ({1:0.0}%)", c
, 100 * c
.ElapsedTime
/ total
.ElapsedTime
);
712 Logger
.Log
.Debug (">>> TOTAL {0}", total
);