2 // LuceneQueryingDriver.cs
4 // Copyright (C) 2004-2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System
.Collections
;
29 using System
.Diagnostics
;
30 using System
.Globalization
;
33 using System
.Threading
;
35 using System
.Xml
.Serialization
;
37 using Lucene
.Net
.Analysis
;
38 using Lucene
.Net
.Analysis
.Standard
;
39 using Lucene
.Net
.Documents
;
40 using Lucene
.Net
.Index
;
41 using Lucene
.Net
.QueryParsers
;
42 using LNS
= Lucene
.Net
.Search
;
46 namespace Beagle
.Daemon
{
48 public class LuceneQueryingDriver
: LuceneCommon
{
50 static public bool Debug
= true;
52 public const string PrivateNamespace
= "_private:";
54 public delegate bool UriFilter (Uri uri
);
55 public delegate double RelevancyMultiplier (Hit hit
);
57 public LuceneQueryingDriver (string index_name
, int minor_version
, bool read_only
)
58 : base (index_name
, minor_version
)
60 // FIXME: Maybe the LuceneQueryingDriver should never try to create the index?
63 // FIXME: Do something sane if we're in read-only mode and want to create an index.
68 ////////////////////////////////////////////////////////////////
70 private class NotHitFilter_Closure
{
73 public NotHitFilter_Closure (HitFilter original
)
75 this.original
= original
;
78 public bool Filter (Hit hit
)
80 return ! original (hit
);
84 public HitFilter
NotHitFilter (HitFilter filter
)
86 NotHitFilter_Closure closure
;
87 closure
= new NotHitFilter_Closure (filter
);
88 return new HitFilter (closure
.Filter
);
91 ////////////////////////////////////////////////////////////////
93 // Returns the lowest matching score before the results are
95 public void DoQuery (Query query
,
97 ICollection search_subset_uris
, // should be internal uris
102 sw
= new Stopwatch ();
105 // Assemble all of the parts into a bunch of Lucene queries
107 ArrayList primary_required_part_queries
= null;
108 ArrayList secondary_required_part_queries
= null;
110 LNS
.BooleanQuery primary_prohibited_part_query
= null;
111 LNS
.BooleanQuery secondary_prohibited_part_query
= null;
113 ArrayList all_hit_filters
= new ArrayList ();
114 all_hit_filters
.Add (hit_filter
);
116 foreach (QueryPart part
in query
.Parts
) {
117 LNS
.Query primary_part_query
;
118 LNS
.Query secondary_part_query
;
119 HitFilter part_hit_filter
;
120 QueryPartToQuery (part
,
121 false, // we want both primary and secondary queries
122 out primary_part_query
,
123 out secondary_part_query
,
124 out part_hit_filter
);
126 switch (part
.Logic
) {
128 case QueryPartLogic
.Required
:
129 if (primary_required_part_queries
== null) {
130 primary_required_part_queries
= new ArrayList ();
131 secondary_required_part_queries
= new ArrayList ();
133 primary_required_part_queries
.Add (primary_part_query
);
134 secondary_required_part_queries
.Add (secondary_part_query
);
136 if (part_hit_filter
!= null)
137 all_hit_filters
.Add (part_hit_filter
);
141 case QueryPartLogic
.Prohibited
:
142 if (primary_prohibited_part_query
== null)
143 primary_prohibited_part_query
= new LNS
.BooleanQuery ();
144 primary_prohibited_part_query
.Add (primary_part_query
, false, false);
146 if (secondary_part_query
!= null) {
147 if (secondary_prohibited_part_query
== null)
148 secondary_prohibited_part_query
= new LNS
.BooleanQuery ();
149 secondary_prohibited_part_query
.Add (secondary_part_query
, false, false);
152 if (part_hit_filter
!= null)
153 all_hit_filters
.Add (NotHitFilter (part_hit_filter
));
159 // If we have no required parts, give up.
160 if (primary_required_part_queries
== null) {
161 Logger
.Log
.Debug ("No required parts, I give up!");
165 Logger
.Log
.Debug ("Assembled queries!");
168 // Now that we have all of these nice queries, let's execute them!
171 // Create the searchers that we will need.
173 LNS
.IndexSearcher primary_searcher
;
174 IndexReader secondary_reader
= null;
175 LNS
.IndexSearcher secondary_searcher
= null;
177 primary_searcher
= new LNS
.IndexSearcher (PrimaryStore
);
179 if (SecondaryStore
!= null) {
180 secondary_reader
= IndexReader
.Open (SecondaryStore
);
181 if (secondary_reader
.NumDocs () == 0) {
182 secondary_reader
.Close ();
183 secondary_reader
= null;
187 if (secondary_reader
!= null)
188 secondary_searcher
= new LNS
.IndexSearcher (secondary_reader
);
191 // Possibly create our whitelists from the search subset.
193 LuceneBitArray primary_whitelist
= null;
194 LuceneBitArray secondary_whitelist
= null;
196 if (search_subset_uris
!= null && search_subset_uris
.Count
> 0) {
197 primary_whitelist
= new LuceneBitArray (primary_searcher
);
198 if (secondary_searcher
!= null)
199 secondary_whitelist
= new LuceneBitArray (secondary_searcher
);
201 foreach (Uri uri
in search_subset_uris
) {
202 primary_whitelist
.AddUri (uri
);
203 if (secondary_whitelist
!= null)
204 secondary_whitelist
.AddUri (uri
);
206 primary_whitelist
.FlushUris ();
207 if (secondary_whitelist
!= null)
208 secondary_whitelist
.FlushUris ();
212 // Build blacklists from our prohibited parts.
214 LuceneBitArray primary_blacklist
= null;
215 LuceneBitArray secondary_blacklist
= null;
217 if (primary_prohibited_part_query
!= null) {
218 primary_blacklist
= new LuceneBitArray (primary_searcher
,
219 primary_prohibited_part_query
);
221 secondary_blacklist
= new LuceneBitArray (secondary_searcher
);
222 if (secondary_prohibited_part_query
!= null)
223 secondary_blacklist
.Or (secondary_prohibited_part_query
);
224 primary_blacklist
.Join (secondary_blacklist
);
228 // Combine our whitelist and blacklist into just a whitelist.
230 if (primary_blacklist
!= null) {
231 if (primary_whitelist
== null) {
232 primary_blacklist
.Not ();
233 primary_whitelist
= primary_blacklist
;
235 primary_whitelist
.AndNot (primary_blacklist
);
239 if (secondary_blacklist
!= null) {
240 if (secondary_whitelist
== null) {
241 secondary_blacklist
.Not ();
242 secondary_whitelist
= secondary_blacklist
;
244 secondary_whitelist
.AndNot (secondary_blacklist
);
248 BetterBitArray primary_matches
= null;
250 if (primary_required_part_queries
!= null) {
252 if (secondary_searcher
!= null)
253 primary_matches
= DoRequiredQueries_TwoIndex (primary_searcher
,
255 primary_required_part_queries
,
256 secondary_required_part_queries
,
258 secondary_whitelist
);
260 primary_matches
= DoRequiredQueries (primary_searcher
,
261 primary_required_part_queries
,
268 Logger
.Log
.Debug ("###### Finished low-level queries in {0}", sw
);
272 // If we didn't get any matches, give up.
274 if (primary_matches
== null || ! primary_matches
.ContainsTrue ())
277 GenerateQueryResults (primary_searcher
,
287 // Finally, we clean up after ourselves.
290 if (secondary_reader
!= null)
291 secondary_reader
.Close ();
292 primary_searcher
.Close ();
293 if (secondary_searcher
!= null)
294 secondary_searcher
.Close ();
299 Logger
.Log
.Debug ("###### Processed query in {0}", sw
);
303 ////////////////////////////////////////////////////////////////
306 // Special logic for handling our set of required queries
309 // This is the easy case: we just combine all of the queries
310 // into one big BooleanQuery.
311 private static BetterBitArray
DoRequiredQueries (LNS
.IndexSearcher primary_searcher
,
312 ArrayList primary_queries
,
313 BetterBitArray primary_whitelist
)
315 LNS
.BooleanQuery combined_query
;
316 combined_query
= new LNS
.BooleanQuery ();
317 foreach (LNS
.Query query
in primary_queries
)
318 combined_query
.Add (query
, true, false);
320 LuceneBitArray matches
;
321 matches
= new LuceneBitArray (primary_searcher
, combined_query
);
322 if (primary_whitelist
!= null)
323 matches
.And (primary_whitelist
);
328 // This code attempts to execute N required queries in the
329 // most efficient order to minimize the amount of time spent
330 // joining between the two indexes. It returns a joined bit
331 // array of matches against the primary index.
333 private class MatchInfo
: IComparable
{
335 public LuceneBitArray PrimaryMatches
= null;
336 public LuceneBitArray SecondaryMatches
= null;
337 public int UpperBound
= 0;
341 PrimaryMatches
.Join (SecondaryMatches
);
344 public void RestrictBy (MatchInfo joined
)
346 if (joined
!= null) {
347 this.PrimaryMatches
.And (joined
.PrimaryMatches
);
348 this.SecondaryMatches
.And (joined
.SecondaryMatches
);
352 UpperBound
+= PrimaryMatches
.TrueCount
;
353 UpperBound
+= SecondaryMatches
.TrueCount
;
356 public int CompareTo (object obj
)
358 MatchInfo other
= (MatchInfo
) obj
;
359 return this.UpperBound
- other
.UpperBound
;
363 // Any whitelists that are passed in must be fully joined, or
364 // query results will be incorrect.
365 private static BetterBitArray
DoRequiredQueries_TwoIndex (LNS
.IndexSearcher primary_searcher
,
366 LNS
.IndexSearcher secondary_searcher
,
367 ArrayList primary_queries
,
368 ArrayList secondary_queries
,
369 BetterBitArray primary_whitelist
,
370 BetterBitArray secondary_whitelist
)
372 ArrayList match_info_list
;
373 match_info_list
= new ArrayList ();
375 // First, do all of the low-level queries
376 // and store them in our MatchInfo
377 for (int i
= 0; i
< primary_queries
.Count
; ++i
) {
379 pq
= primary_queries
[i
] as LNS
.Query
;
380 sq
= secondary_queries
[i
] as LNS
.Query
;
382 LuceneBitArray p_matches
= null, s_matches
= null;
383 p_matches
= new LuceneBitArray (primary_searcher
);
386 if (primary_whitelist
!= null)
387 p_matches
.And (primary_whitelist
);
390 s_matches
= new LuceneBitArray (secondary_searcher
);
393 if (secondary_whitelist
!= null)
394 s_matches
.And (secondary_whitelist
);
398 info
= new MatchInfo ();
399 info
.PrimaryMatches
= p_matches
;
400 info
.SecondaryMatches
= s_matches
;
401 info
.RestrictBy (null); // a hack to initialize the UpperBound
402 match_info_list
.Add (info
);
405 // We want to be smart about the order we do this in,
406 // to minimize the expense of the Join.
407 while (match_info_list
.Count
> 1) {
409 // FIXME: We don't really need to sort here, it would
410 // be sufficient to just find the minimal element.
411 match_info_list
.Sort ();
413 smallest
= match_info_list
[0] as MatchInfo
;
414 match_info_list
.RemoveAt (0);
416 // We can short-circuit if our smallest set of
418 if (smallest
.UpperBound
== 0)
419 return smallest
.PrimaryMatches
; // this must be an empty array.
423 foreach (MatchInfo info
in match_info_list
)
424 info
.RestrictBy (smallest
);
427 // For the final pair, we don't need to do a full join:
428 // mapping the secondary onto the primary is sufficient
430 last
= match_info_list
[0] as MatchInfo
;
431 last
.SecondaryMatches
.ProjectOnto (last
.PrimaryMatches
);
433 return last
.PrimaryMatches
;
436 ////////////////////////////////////////////////////////////////
439 // Given a set of hits, broadcast some set out as our query
443 private static void GenerateQueryResults (LNS
.IndexSearcher primary_searcher
,
444 LNS
.IndexSearcher secondary_searcher
,
445 BetterBitArray primary_matches
,
450 UriFilter uri_filter
,
451 HitFilter hit_filter
)
453 TopScores top_docs
= null;
454 ArrayList all_docs
= null;
456 long min_date_num
, max_date_num
;
457 min_date_num
= Int64
.Parse (StringFu
.DateTimeToString (min_date
));
458 max_date_num
= Int64
.Parse (StringFu
.DateTimeToString (max_date
));
460 if (max_date_num
< min_date_num
)
464 Logger
.Log
.Debug (">>> Initially handed {0} matches", primary_matches
.TrueCount
);
466 if (primary_matches
.TrueCount
<= max_results
) {
468 Logger
.Log
.Debug (">>> Initial count is within our limit of {0}", max_results
);
469 all_docs
= new ArrayList ();
472 Logger
.Log
.Debug (">>> Number of hits is capped at {0}", max_results
);
473 top_docs
= new TopScores (max_results
);
476 Stopwatch total
, a
, b
, c
;
477 total
= new Stopwatch ();
478 a
= new Stopwatch ();
479 b
= new Stopwatch ();
480 c
= new Stopwatch ();
485 // Pull in the primary documents.
486 // We walk across them backwards, since newer
487 // documents are more likely to be at the end of
489 int j
= primary_matches
.Count
;
492 i
= primary_matches
.GetPreviousTrueIndex (j
);
495 j
= i
-1; // This way we can't forget to adjust i
498 doc
= primary_searcher
.Doc (i
);
500 // Check the timestamp to make sure it is in range
502 timestamp_num
= Int64
.Parse (doc
.Get ("Timestamp"));
503 if (timestamp_num
< min_date_num
|| max_date_num
< timestamp_num
)
506 if (top_docs
!= null && ! top_docs
.WillAccept (timestamp_num
))
509 // If we have a UriFilter, apply it.
510 if (uri_filter
!= null) {
512 uri
= GetUriFromDocument (doc
);
513 if (! uri_filter (uri
))
517 // Add the document to the appropriate data structure.
518 // We use the timestamp_num as the score, so high
519 // scores correspond to more-recent timestamps.
520 if (all_docs
!= null)
523 top_docs
.Add (timestamp_num
, doc
);
530 ICollection final_list_of_docs
;
531 if (all_docs
!= null)
532 final_list_of_docs
= all_docs
;
534 final_list_of_docs
= top_docs
.TopScoringObjects
;
536 ArrayList final_list_of_hits
;
537 final_list_of_hits
= new ArrayList (final_list_of_docs
.Count
);
539 // If we aren't using the secondary index, the next step is
540 // very straightforward.
541 if (secondary_searcher
== null) {
543 foreach (Document doc
in final_list_of_docs
) {
545 hit
= DocumentToHit (doc
);
546 final_list_of_hits
.Add (hit
);
552 Logger
.Log
.Debug (">>> Performing cross-index Hit reunification");
554 Hashtable hits_by_uri
;
555 hits_by_uri
= UriFu
.NewHashtable ();
557 LuceneBitArray secondary_matches
;
558 secondary_matches
= new LuceneBitArray (secondary_searcher
);
560 foreach (Document primary_doc
in final_list_of_docs
) {
563 hit
= DocumentToHit (primary_doc
);
565 hits_by_uri
[hit
.Uri
] = hit
;
567 secondary_matches
.AddUri (hit
.Uri
);
570 secondary_matches
.FlushUris ();
572 // Attach all of our secondary properties
577 i
= secondary_matches
.GetNextTrueIndex (j
);
578 if (i
>= secondary_matches
.Count
)
582 Document secondary_doc
;
583 secondary_doc
= secondary_searcher
.Doc (i
);
586 uri
= GetUriFromDocument (secondary_doc
);
587 Logger
.Log
.Debug ("Joining {0}", uri
);
590 hit
= hits_by_uri
[uri
] as Hit
;
592 AddPropertiesToHit (hit
, secondary_doc
, false);
594 final_list_of_hits
.Add (hit
);
600 // If we used the TopScores object, we got our original
601 // list of documents sorted for us. If not, sort the
603 if (top_docs
== null)
604 final_list_of_hits
.Sort ();
608 Logger
.Log
.Debug ("Final list length = {0}", final_list_of_hits
.Count
);
610 // If we have a hit_filter, use it now.
611 if (hit_filter
!= null) {
612 for (int i
= 0; i
< final_list_of_hits
.Count
; ++i
) {
614 hit
= final_list_of_hits
[i
] as Hit
;
615 if (! hit_filter (hit
)) {
616 Logger
.Log
.Debug ("Filtered out {0}", hit
.Uri
);
617 final_list_of_hits
[i
] = null;
622 // Before we broadcast a hit, we strip out any
623 // properties in the PrivateNamespace. We
624 // manipulate the property ArrayList directory,
625 // which is pretty gross... but this is safe,
626 // since removing items will not change the sort
628 foreach (Hit hit
in final_list_of_hits
) {
632 while (i
< hit
.Properties
.Count
) {
633 Property prop
= hit
.Properties
[i
] as Property
;
634 if (prop
.Key
.StartsWith (PrivateNamespace
))
635 hit
.Properties
.RemoveAt (i
);
641 result
.Add (final_list_of_hits
);
647 Logger
.Log
.Debug (">>> GenerateQueryResults time statistics:");
648 Logger
.Log
.Debug (">>> First pass {0} ({1:0.0}%)", a
, 100 * a
.ElapsedTime
/ total
.ElapsedTime
);
649 Logger
.Log
.Debug (">>> Hit assembly {0} ({1:0.0}%)", b
, 100 * b
.ElapsedTime
/ total
.ElapsedTime
);
650 Logger
.Log
.Debug (">>> Final pass {0} ({1:0.0}%)", c
, 100 * c
.ElapsedTime
/ total
.ElapsedTime
);
651 Logger
.Log
.Debug (">>> TOTAL {0}", total
);