2006-09-10 Francisco Javier F. Serrador <serrador@openshine.com>
[beagle.git] / beagled / LuceneQueryingDriver.cs
blob6df2cfdb9ea5f3564543faee38f8d2d2ab8ae96e
1 //
2 // LuceneQueryingDriver.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
27 using System;
28 using System.Collections;
29 using System.Diagnostics;
30 using System.Globalization;
31 using System.IO;
32 using System.Text;
33 using System.Threading;
34 using System.Xml;
35 using System.Xml.Serialization;
37 using Lucene.Net.Analysis;
38 using Lucene.Net.Analysis.Standard;
39 using Lucene.Net.Documents;
40 using Lucene.Net.Index;
41 using Lucene.Net.QueryParsers;
42 using LNS = Lucene.Net.Search;
44 using Beagle.Util;
46 namespace Beagle.Daemon {
48 public class LuceneQueryingDriver : LuceneCommon {
// Toggle for the verbose query-debugging log output used throughout this class.
50 static public bool Debug = false;
// Prefix for properties that are internal to the daemon; hits have these
// properties stripped before being broadcast (see GenerateQueryResults).
52 public const string PrivateNamespace = "_private:";
// Predicate used to accept or reject a hit by its Uri before it is materialized.
54 public delegate bool UriFilter (Uri uri);
// NOTE(review): not referenced in the visible portion of this file.
55 public delegate double RelevancyMultiplier (Hit hit);
// Opens the Lucene index named index_name, creating it if it does not
// exist and we are not in read-only mode. In read-only mode a missing
// index is an error (we cannot create one), so we throw.
57 public LuceneQueryingDriver (string index_name, int minor_version, bool read_only)
58 : base (index_name, minor_version)
60 // FIXME: Maybe the LuceneQueryingDriver should never try to create the index?
61 if (Exists ())
62 Open (read_only);
63 else if (!read_only)
64 Create ();
65 else {
66 // We're in read-only mode, but we can't create an index.
67 // Maybe a different exception would be better? This one is caught
68 // in QueryDriver.LoadStaticQueryable ()
69 throw new InvalidOperationException ();
72 // Initialize the user text cache only if we're not in
73 // read-only mode. StaticQueryables instantiate their
74 // own text caches that are stored in a separate
75 // location.
76 if (!read_only)
77 text_cache = TextCache.UserCache;
80 ////////////////////////////////////////////////////////////////
83 ////////////////////////////////////////////////////////////////
// Returns the Uris of all documents in the primary index whose
// property (prop.Type, prop.Key) exactly equals prop.Value.
// Implemented as a single Lucene TermQuery against the mapped field name.
85 public Uri[] PropertyQuery (Property prop)
87 // FIXME: Should we support scanning the secondary
88 // index as well?
90 IndexReader primary_reader;
91 LNS.IndexSearcher primary_searcher;
93 primary_reader = LuceneCommon.GetReader (PrimaryStore);
94 primary_searcher = new LNS.IndexSearcher (primary_reader);
96 Term term = new Term (PropertyToFieldName (prop.Type, prop.Key), prop.Value);
97 LNS.TermQuery query = new LNS.TermQuery (term);
98 LNS.Hits hits = primary_searcher.Search (query);
// Materialize one Uri per matching document.
100 Uri[] uri_list = new Uri [hits.Length ()];
101 for (int i = 0; i < hits.Length (); i++) {
102 Document doc;
103 doc = hits.Doc (i);
104 uri_list [i] = GetUriFromDocument (doc);
// Release the searcher and hand the reader back to the pool.
107 primary_searcher.Close ();
108 ReleaseReader (primary_reader);
110 return uri_list;
113 ////////////////////////////////////////////////////////////////
115 // Executes the given query against the primary (and, if present,
116 // secondary) index and feeds matching hits into 'result'.
// NOTE(review): the original comment here claimed this "returns the lowest
// matching score", but the method returns void — results are delivered
// through the IQueryResult parameter.
117 public void DoQuery (Query query,
118 IQueryResult result,
119 ICollection search_subset_uris, // should be internal uris
120 UriFilter uri_filter,
121 HitFilter hit_filter)
123 Stopwatch sw;
124 sw = new Stopwatch ();
125 sw.Start ();
127 // Assemble all of the parts into a bunch of Lucene queries
129 ArrayList primary_required_part_queries = null;
130 ArrayList secondary_required_part_queries = null;
132 LNS.BooleanQuery primary_prohibited_part_query = null;
133 LNS.BooleanQuery secondary_prohibited_part_query = null;
// All per-part hit filters (plus the caller's) are AND-ed together and
// applied during result generation.
135 AndHitFilter all_hit_filters;
136 all_hit_filters = new AndHitFilter ();
137 if (hit_filter != null)
138 all_hit_filters.Add (hit_filter);
// Terms from required parts are collected here for relevancy scoring later.
140 ArrayList term_list = new ArrayList ();
142 foreach (QueryPart part in query.Parts) {
143 LNS.Query primary_part_query;
144 LNS.Query secondary_part_query;
145 HitFilter part_hit_filter;
146 QueryPartToQuery (part,
147 false, // we want both primary and secondary queries
148 part.Logic == QueryPartLogic.Required ? term_list : null,
149 out primary_part_query,
150 out secondary_part_query,
151 out part_hit_filter);
// A part that produced no primary query contributes nothing.
153 if (primary_part_query == null)
154 continue;
156 switch (part.Logic) {
// Required parts are accumulated into parallel lists; the queries at
// index i of each list came from the same part.
158 case QueryPartLogic.Required:
159 if (primary_required_part_queries == null) {
160 primary_required_part_queries = new ArrayList ();
161 secondary_required_part_queries = new ArrayList ();
163 primary_required_part_queries.Add (primary_part_query);
164 secondary_required_part_queries.Add (secondary_part_query);
166 if (part_hit_filter != null)
167 all_hit_filters.Add (part_hit_filter);
169 break;
// Prohibited parts are OR-ed into a single BooleanQuery per index;
// their hit filters are negated before being AND-ed in.
171 case QueryPartLogic.Prohibited:
172 if (primary_prohibited_part_query == null)
173 primary_prohibited_part_query = new LNS.BooleanQuery ();
174 primary_prohibited_part_query.Add (primary_part_query, false, false);
176 if (secondary_part_query != null) {
177 if (secondary_prohibited_part_query == null)
178 secondary_prohibited_part_query = new LNS.BooleanQuery ();
179 secondary_prohibited_part_query.Add (secondary_part_query, false, false);
182 if (part_hit_filter != null) {
183 NotHitFilter nhf;
184 nhf = new NotHitFilter (part_hit_filter);
185 all_hit_filters.Add (new HitFilter (nhf.HitFilter));
188 break;
192 // If we have no required parts, give up.
193 if (primary_required_part_queries == null)
194 return;
197 // Now that we have all of these nice queries, let's execute them!
200 // Create the searchers that we will need.
202 IndexReader primary_reader;
203 LNS.IndexSearcher primary_searcher;
204 IndexReader secondary_reader = null;
205 LNS.IndexSearcher secondary_searcher = null;
207 primary_reader = LuceneCommon.GetReader (PrimaryStore);
208 primary_searcher = new LNS.IndexSearcher (primary_reader);
// An empty secondary index is treated the same as no secondary index.
210 if (SecondaryStore != null) {
211 secondary_reader = LuceneCommon.GetReader (SecondaryStore);
212 if (secondary_reader.NumDocs () == 0) {
213 ReleaseReader (secondary_reader);
214 secondary_reader = null;
218 if (secondary_reader != null)
219 secondary_searcher = new LNS.IndexSearcher (secondary_reader);
222 // Possibly create our whitelists from the search subset.
224 LuceneBitArray primary_whitelist = null;
225 LuceneBitArray secondary_whitelist = null;
227 if (search_subset_uris != null && search_subset_uris.Count > 0) {
228 primary_whitelist = new LuceneBitArray (primary_searcher);
229 if (secondary_searcher != null)
230 secondary_whitelist = new LuceneBitArray (secondary_searcher);
232 foreach (Uri uri in search_subset_uris) {
233 primary_whitelist.AddUri (uri);
234 if (secondary_whitelist != null)
235 secondary_whitelist.AddUri (uri);
237 primary_whitelist.FlushUris ();
238 if (secondary_whitelist != null)
239 secondary_whitelist.FlushUris ();
243 // Build blacklists from our prohibited parts.
245 LuceneBitArray primary_blacklist = null;
246 LuceneBitArray secondary_blacklist = null;
248 if (primary_prohibited_part_query != null) {
249 primary_blacklist = new LuceneBitArray (primary_searcher,
250 primary_prohibited_part_query);
// Join the secondary prohibitions into the primary blacklist so that a
// document prohibited in either index is excluded.
252 if (secondary_searcher != null) {
253 secondary_blacklist = new LuceneBitArray (secondary_searcher);
254 if (secondary_prohibited_part_query != null)
255 secondary_blacklist.Or (secondary_prohibited_part_query);
256 primary_blacklist.Join (secondary_blacklist);
261 // Combine our whitelist and blacklist into just a whitelist.
263 if (primary_blacklist != null) {
264 if (primary_whitelist == null) {
265 primary_blacklist.Not ();
266 primary_whitelist = primary_blacklist;
267 } else {
268 primary_whitelist.AndNot (primary_blacklist);
272 if (secondary_blacklist != null) {
273 if (secondary_whitelist == null) {
274 secondary_blacklist.Not ();
275 secondary_whitelist = secondary_blacklist;
276 } else {
277 secondary_whitelist.AndNot (secondary_blacklist);
281 BetterBitArray primary_matches = null;
// Run the required queries; the two-index path handles joining matches
// between the primary and secondary stores.
283 if (primary_required_part_queries != null) {
285 if (secondary_searcher != null)
286 primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
287 secondary_searcher,
288 primary_required_part_queries,
289 secondary_required_part_queries,
290 primary_whitelist,
291 secondary_whitelist);
292 else
293 primary_matches = DoRequiredQueries (primary_searcher,
294 primary_required_part_queries,
295 primary_whitelist);
299 sw.Stop ();
300 if (Debug)
301 Logger.Log.Debug ("###### Finished low-level queries in {0}", sw);
302 sw.Reset ();
303 sw.Start ();
305 // Only generate results if we got some matches
306 if (primary_matches != null && primary_matches.ContainsTrue ()) {
307 GenerateQueryResults (primary_reader,
308 primary_searcher,
309 secondary_searcher,
310 primary_matches,
311 result,
312 term_list,
313 query.MaxHits,
314 uri_filter,
315 new HitFilter (all_hit_filters.HitFilter));
319 // Finally, we clean up after ourselves.
322 primary_searcher.Close ();
323 if (secondary_searcher != null)
324 secondary_searcher.Close ();
325 ReleaseReader (primary_reader);
326 if (secondary_reader != null)
327 ReleaseReader (secondary_reader);
330 sw.Stop ();
331 if (Debug)
332 Logger.Log.Debug ("###### Processed query in {0}", sw);
336 ////////////////////////////////////////////////////////////////
339 // Special logic for handling our set of required queries
342 // This is the easy case: we just combine all of the queries
343 // into one big BooleanQuery.
// Single-index path: AND every required query together, run it once,
// and intersect with the whitelist (if any). Returns the matching
// primary-index document ids as a bit array.
344 private static BetterBitArray DoRequiredQueries (LNS.IndexSearcher primary_searcher,
345 ArrayList primary_queries,
346 BetterBitArray primary_whitelist)
348 LNS.BooleanQuery combined_query;
349 combined_query = new LNS.BooleanQuery ();
// Add (query, required=true, prohibited=false): every clause must match.
350 foreach (LNS.Query query in primary_queries)
351 combined_query.Add (query, true, false);
353 LuceneBitArray matches;
354 matches = new LuceneBitArray (primary_searcher, combined_query);
355 if (primary_whitelist != null)
356 matches.And (primary_whitelist);
358 return matches;
361 // This code attempts to execute N required queries in the
362 // most efficient order to minimize the amount of time spent
363 // joining between the two indexes. It returns a joined bit
364 // array of matches against the primary index.
// Holds the per-query match sets against both indexes, plus an upper
// bound on the joined match count, used to order the join work in
// DoRequiredQueries_TwoIndex (sorted smallest-first via IComparable).
366 private class MatchInfo : IComparable {
368 public LuceneBitArray PrimaryMatches = null;
369 public LuceneBitArray SecondaryMatches = null;
// Upper bound on matches: sum of true bits in both arrays.
370 public int UpperBound = 0;
// Fully joins the two match sets across the index boundary.
372 public void Join ()
374 PrimaryMatches.Join (SecondaryMatches);
// Intersects this match set with an already-joined one (pass null to
// just (re)compute UpperBound), then refreshes UpperBound.
377 public void RestrictBy (MatchInfo joined)
379 if (joined != null) {
380 this.PrimaryMatches.And (joined.PrimaryMatches);
381 this.SecondaryMatches.And (joined.SecondaryMatches);
384 UpperBound = 0;
385 UpperBound += PrimaryMatches.TrueCount;
386 UpperBound += SecondaryMatches.TrueCount;
// Orders MatchInfos by ascending UpperBound (smallest candidate set first).
389 public int CompareTo (object obj)
391 MatchInfo other = (MatchInfo) obj;
392 return this.UpperBound - other.UpperBound;
396 // Any whitelists that are passed in must be fully joined, or
397 // query results will be incorrect.
// Two-index path: runs each required query against both indexes, then
// repeatedly joins the smallest candidate set first and uses it to
// restrict the rest, minimizing expensive cross-index join work.
// Returns the joined matches as primary-index document ids.
398 private static BetterBitArray DoRequiredQueries_TwoIndex (LNS.IndexSearcher primary_searcher,
399 LNS.IndexSearcher secondary_searcher,
400 ArrayList primary_queries,
401 ArrayList secondary_queries,
402 BetterBitArray primary_whitelist,
403 BetterBitArray secondary_whitelist)
405 ArrayList match_info_list;
406 match_info_list = new ArrayList ();
408 // First, do all of the low-level queries
409 // and store them in our MatchInfo
410 for (int i = 0; i < primary_queries.Count; ++i) {
411 LNS.Query pq, sq;
412 pq = primary_queries [i] as LNS.Query;
413 sq = secondary_queries [i] as LNS.Query;
// A null query yields an empty (all-false) bit array for that index.
415 LuceneBitArray p_matches = null, s_matches = null;
416 p_matches = new LuceneBitArray (primary_searcher);
417 if (pq != null) {
418 p_matches.Or (pq);
419 if (primary_whitelist != null)
420 p_matches.And (primary_whitelist);
423 s_matches = new LuceneBitArray (secondary_searcher);
424 if (sq != null) {
425 s_matches.Or (sq);
426 if (secondary_whitelist != null)
427 s_matches.And (secondary_whitelist);
430 MatchInfo info;
431 info = new MatchInfo ();
432 info.PrimaryMatches = p_matches;
433 info.SecondaryMatches = s_matches;
434 info.RestrictBy (null); // a hack to initialize the UpperBound
435 match_info_list.Add (info);
438 // We want to be smart about the order we do this in,
439 // to minimize the expense of the Join.
440 while (match_info_list.Count > 1) {
442 // linear scan to find the minimum
443 int index_min = 0;
444 for (int i = 1; i < match_info_list.Count; ++i)
445 if (((MatchInfo) match_info_list [i]).CompareTo ((MatchInfo) match_info_list [index_min]) < 0)
446 index_min = i;
448 MatchInfo smallest;
449 smallest = match_info_list [index_min] as MatchInfo;
450 match_info_list.RemoveAt (index_min);
452 // We can short-circuit if our smallest set of
453 // matches is empty.
454 if (smallest.UpperBound == 0)
455 return smallest.PrimaryMatches; // this must be an empty array.
// Join the smallest set across the indexes, then use it to narrow
// every remaining candidate set (also refreshing their UpperBounds).
457 smallest.Join ();
459 foreach (MatchInfo info in match_info_list)
460 info.RestrictBy (smallest);
463 // For the final pair, we don't need to do a full join:
464 // mapping the secondary onto the primary is sufficient
465 MatchInfo last;
466 last = match_info_list [0] as MatchInfo;
467 last.SecondaryMatches.ProjectOnto (last.PrimaryMatches);
469 return last.PrimaryMatches;
472 ////////////////////////////////////////////////////////////////
// Accumulates a tf*idf relevancy score onto each Hit in hits_by_id
// (keyed by Lucene document id) for every query term in term_list,
// using the default Lucene Similarity.
474 static private void ScoreHits (Hashtable hits_by_id,
475 IndexReader reader,
476 ICollection term_list)
478 Stopwatch sw;
479 sw = new Stopwatch ();
480 sw.Start ();
482 LNS.Similarity similarity;
483 similarity = LNS.Similarity.GetDefault ();
485 foreach (Term term in term_list) {
// idf is constant per term across all documents.
487 double idf;
488 idf = similarity.Idf (reader.DocFreq (term), reader.MaxDoc ());
// hit_count lets us stop walking the postings early once every
// hit of interest has been scored for this term.
490 int hit_count;
491 hit_count = hits_by_id.Count;
493 TermDocs term_docs;
494 term_docs = reader.TermDocs (term);
495 while (term_docs.Next () && hit_count > 0) {
497 int id;
498 id = term_docs.Doc ();
// Only documents that are part of our result set get scored.
500 Hit hit;
501 hit = hits_by_id [id] as Hit;
502 if (hit != null) {
503 double tf;
504 tf = similarity.Tf (term_docs.Freq ());
505 hit.Score += tf * idf;
506 --hit_count;
511 sw.Stop ();
514 ////////////////////////////////////////////////////////////////
// Pairs a retrieved Lucene Document with its index-internal document id,
// so GenerateQueryResults can later score hits by id without re-lookup.
516 private class DocAndId {
517 public Document Doc;
518 public int Id;
522 // Given a set of hits, broadcast some set out as our query
523 // results.
// Converts the primary-index matches into Hit objects (capped at
// max_results by timestamp recency), reunifies them with secondary-index
// properties when a secondary searcher is present, scores and filters
// them, strips private properties, and adds them to 'result'.
// NOTE(review): this listing is truncated; the method's tail is not
// visible past the final debug-logging block.
526 private static void GenerateQueryResults (IndexReader primary_reader,
527 LNS.IndexSearcher primary_searcher,
528 LNS.IndexSearcher secondary_searcher,
529 BetterBitArray primary_matches,
530 IQueryResult result,
531 ICollection query_term_list,
532 int max_results,
533 UriFilter uri_filter,
534 HitFilter hit_filter)
// Exactly one of these is used: all_docs when every match fits within
// max_results, otherwise top_docs keeps only the best (most recent).
536 TopScores top_docs = null;
537 ArrayList all_docs = null;
539 if (Debug)
540 Logger.Log.Debug (">>> Initially handed {0} matches", primary_matches.TrueCount);
542 if (primary_matches.TrueCount <= max_results) {
543 if (Debug)
544 Logger.Log.Debug (">>> Initial count is within our limit of {0}", max_results);
545 all_docs = new ArrayList ();
546 } else {
547 if (Debug)
548 Logger.Log.Debug (">>> Number of hits is capped at {0}", max_results);
549 top_docs = new TopScores (max_results);
// Timers: a = primary pull, b = hit assembly/scoring, c = filtering/broadcast.
552 Stopwatch total, a, b, c;
553 total = new Stopwatch ();
554 a = new Stopwatch ();
555 b = new Stopwatch ();
556 c = new Stopwatch ();
558 total.Start ();
559 a.Start ();
561 // Pull in the primary documents.
562 // We walk across them backwards, since newer
563 // documents are more likely to be at the end of
564 // the index.
565 int j = primary_matches.Count;
566 while (true) {
567 int i;
568 i = primary_matches.GetPreviousTrueIndex (j);
569 if (i < 0)
570 break;
571 j = i-1; // This way we can't forget to adjust i
573 Document doc;
574 doc = primary_searcher.Doc (i);
576 // Check the timestamp --- if we have already reached our
577 // limit, we might be able to reject it immediately.
578 string timestamp_str;
579 long timestamp_num = 0;
581 timestamp_str = doc.Get ("Timestamp");
582 if (timestamp_str == null) {
583 Logger.Log.Warn ("No timestamp on {0}!", GetUriFromDocument (doc));
584 } else {
585 timestamp_num = Int64.Parse (doc.Get ("Timestamp"));
586 if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
587 continue;
590 // If we have a UriFilter, apply it.
591 if (uri_filter != null) {
592 Uri uri;
593 uri = GetUriFromDocument (doc);
594 if (! uri_filter (uri))
595 continue;
598 DocAndId doc_and_id = new DocAndId ();
599 doc_and_id.Doc = doc;
600 doc_and_id.Id = i;
602 // Add the document to the appropriate data structure.
603 // We use the timestamp_num as the score, so high
604 // scores correspond to more-recent timestamps.
605 if (all_docs != null)
606 all_docs.Add (doc_and_id);
607 else
608 top_docs.Add (timestamp_num, doc_and_id);
611 a.Stop ();
613 b.Start ();
615 ICollection final_list_of_docs;
616 if (all_docs != null)
617 final_list_of_docs = all_docs;
618 else
619 final_list_of_docs = top_docs.TopScoringObjects;
621 ArrayList final_list_of_hits;
622 final_list_of_hits = new ArrayList (final_list_of_docs.Count);
624 // This is used only for scoring
625 Hashtable hits_by_id = null;
626 hits_by_id = new Hashtable ();
628 // If we aren't using the secondary index, the next step is
629 // very straightforward.
630 if (secondary_searcher == null) {
632 foreach (DocAndId doc_and_id in final_list_of_docs) {
633 Hit hit;
634 hit = DocumentToHit (doc_and_id.Doc);
635 hits_by_id [doc_and_id.Id] = hit;
636 final_list_of_hits.Add (hit);
639 } else {
641 if (Debug)
642 Logger.Log.Debug (">>> Performing cross-index Hit reunification");
// Hits are matched to their secondary-index documents by Uri.
644 Hashtable hits_by_uri;
645 hits_by_uri = UriFu.NewHashtable ();
647 LuceneBitArray secondary_matches;
648 secondary_matches = new LuceneBitArray (secondary_searcher);
650 foreach (DocAndId doc_and_id in final_list_of_docs) {
651 Hit hit;
652 hit = DocumentToHit (doc_and_id.Doc);
653 hits_by_id [doc_and_id.Id] = hit;
654 hits_by_uri [hit.Uri] = hit;
655 secondary_matches.AddUri (hit.Uri);
658 secondary_matches.FlushUris ();
660 // Attach all of our secondary properties
661 // to the hits
662 j = 0;
663 while (true) {
664 int i;
665 i = secondary_matches.GetNextTrueIndex (j);
666 if (i >= secondary_matches.Count)
667 break;
668 j = i+1;
670 Document secondary_doc;
671 secondary_doc = secondary_searcher.Doc (i);
673 Uri uri;
674 uri = GetUriFromDocument (secondary_doc);
676 Hit hit;
677 hit = hits_by_uri [uri] as Hit;
679 AddPropertiesToHit (hit, secondary_doc, false);
681 final_list_of_hits.Add (hit);
// Apply tf*idf relevancy scores to every assembled hit.
685 ScoreHits (hits_by_id, primary_reader, query_term_list);
687 b.Stop ();
689 // If we used the TopScores object, we got our original
690 // list of documents sorted for us. If not, sort the
691 // final list.
692 if (top_docs == null)
693 final_list_of_hits.Sort ();
695 c.Start ();
697 // If we have a hit_filter, use it now.
// Rejected entries are nulled in place (not removed) to keep ordering;
// downstream consumers skip nulls.
698 if (hit_filter != null) {
699 for (int i = 0; i < final_list_of_hits.Count; ++i) {
700 Hit hit;
701 hit = final_list_of_hits [i] as Hit;
702 if (! hit_filter (hit)) {
703 if (Debug)
704 Logger.Log.Debug ("Filtered out {0}", hit.Uri);
705 final_list_of_hits [i] = null;
710 // Before we broadcast a hit, we strip out any
711 // properties in the PrivateNamespace. We
712 // manipulate the property ArrayList directly,
713 // which is pretty gross... but this is safe,
714 // since removing items will not change the sort
715 // order.
716 foreach (Hit hit in final_list_of_hits) {
717 if (hit == null)
718 continue;
719 int i = 0;
720 while (i < hit.Properties.Count) {
721 Property prop = hit.Properties [i] as Property;
722 if (prop.Key.StartsWith (PrivateNamespace))
723 hit.Properties.RemoveAt (i);
724 else
725 ++i;
729 result.Add (final_list_of_hits);
731 c.Stop ();
732 total.Stop ();
734 if (Debug) {
735 Logger.Log.Debug (">>> GenerateQueryResults time statistics:");
736 Logger.Log.Debug (">>> First pass {0} ({1:0.0}%)", a, 100 * a.ElapsedTime / total.ElapsedTime);
737 Logger.Log.Debug (">>> Hit assembly {0} ({1:0.0}%)", b, 100 * b.ElapsedTime / total.ElapsedTime);
738 Logger.Log.Debug (">>> Final pass {0} ({1:0.0}%)", c, 100 * c.ElapsedTime / total.ElapsedTime);
739 Logger.Log.Debug (">>> TOTAL {0}", total);