4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This should be the only piece of source code that knows anything
29 // about Lucene's internals.
33 using System
.Collections
;
34 using System
.Diagnostics
;
35 using System
.Globalization
;
38 using System
.Threading
;
40 using System
.Xml
.Serialization
;
42 using Lucene
.Net
.Analysis
;
43 using Lucene
.Net
.Analysis
.Standard
;
44 using Lucene
.Net
.Documents
;
45 using Lucene
.Net
.Index
;
46 using Lucene
.Net
.QueryParsers
;
47 using LNS
= Lucene
.Net
.Search
;
51 namespace Beagle
.Daemon
{
53 public class LuceneDriver
: IIndexer
{
55 public delegate bool UriFilter (Uri uri
);
56 public delegate Uri
UriRemapper (Uri uri
);
57 public delegate Hit
HitProcessor (Hit hit
);
58 public delegate double RelevancyMultiplier (Hit hit
);
60 public event IIndexerChangedHandler ChangedEvent
;
61 public event IIndexerChildIndexableHandler ChildIndexableEvent
;
62 public event IIndexerUrisFilteredHandler UrisFilteredEvent
;
64 /////////////////////////////////////////////////////
67 // 2: Changed format of timestamp strings
68 // 3: Schema changed to be more Dashboard-Match-like
69 // 4: Schema changed for files to include _Directory property
70 // 5: Changed analyzer to support stemming. Bumped version # to
71 // force everyone to re-index.
72 // 6: lots of schema changes as part of the general refactoring
73 // 7: incremented to force a re-index after our upgrade to lucene 1.4
74 // (in theory the file formats are compatible, we are seeing 'term
75 // out of order' exceptions in some cases)
76 // 8: another forced re-index, this time because of massive changes
77 // in the file system backend (it would be nice to have per-backend
78 // versioning so that we didn't have to purge all indexes just
79 // because one changed)
80 // 9: changed the way properties are stored, changed in conjunction
81 // with sane handling of multiple properties on hits.
82 private const int MAJOR_VERSION
= 9;
83 private int minor_version
= 0;
85 private string top_dir
;
86 private Hashtable pending_by_uri
= UriFu
.NewHashtable ();
87 private int pending_adds
= 0;
88 private int pending_removals
= 0;
89 private int cycles_since_last_optimization
= 0;
90 private bool optimizing
= false;
91 private int last_item_count
= -1;
93 private TextCache text_cache
= TextCache
.UserCache
;
95 public TextCache TextCache
{
96 get { return text_cache; }
97 set { text_cache = value; }
100 /////////////////////////////////////////////////////
102 public LuceneDriver (string index_name
) : this (index_name
, -1, false) { }
104 public LuceneDriver (string index_name
, string index_dir
) : this (index_name
, -1, false) { }
106 public LuceneDriver (string index_name
, bool disable_locking
) : this (index_name
, -1, disable_locking
) { }
108 public LuceneDriver (string index_name
, int index_version
) : this (index_name
, index_version
, false) { }
110 public LuceneDriver (string index_name
, int index_version
, bool disable_locking
)
112 Setup (index_name
, index_version
, disable_locking
);
115 public string IndexDirectory
{
116 get { return top_dir; }
119 /////////////////////////////////////////////////////
125 private Lucene
.Net
.Store
.Directory ourStore
= null;
126 private string ourStorePath
= null;
129 public Lucene
.Net
.Store
.Directory Store
{
130 get { return ourStore; }
133 public string StorePath
{
134 get { return ourStorePath; }
137 /////////////////////////////////////////////////////
139 private void Setup (string index_name
, int minor_version
, bool disable_locking
)
141 top_dir
= Path
.Combine (PathFinder
.IndexDir
, index_name
);
143 string versionFile
= Path
.Combine (top_dir
, "version");
144 string fingerprintFile
= Path
.Combine (top_dir
, "fingerprint");
145 string lockDir
= Path
.Combine (top_dir
, "Locks");
146 string indexDir
= Path
.Combine (top_dir
, "Index");
147 string indexTestFile
= Path
.Combine (indexDir
, "segments");
149 bool versionExists
= File
.Exists (versionFile
);
150 bool fingerprintExists
= File
.Exists (fingerprintFile
);
151 bool indexExists
= File
.Exists (indexTestFile
);
153 // Check the index's version number. If it is wrong,
154 // declare the index non-existent.
155 if (versionExists
&& indexExists
) {
156 StreamReader sr
= new StreamReader (versionFile
);
157 string versionStr
= sr
.ReadLine ();
160 int old_major_version
, old_minor_version
;
161 int i
= versionStr
.IndexOf (".");
164 old_major_version
= Convert
.ToInt32 (versionStr
.Substring (0,i
));
165 old_minor_version
= Convert
.ToInt32 (versionStr
.Substring (i
+1));
167 old_major_version
= Convert
.ToInt32 (versionStr
);
168 old_minor_version
= 0;
171 if (minor_version
>= 0 && (old_major_version
!= MAJOR_VERSION
|| old_minor_version
!= minor_version
)) {
172 log
.Debug ("Version mismatch in {0}", index_name
);
173 log
.Debug ("Index has version {0}.{1}, expected {2}.{3}",
174 old_major_version
, old_minor_version
,
175 MAJOR_VERSION
, minor_version
);
180 // If there is no fingerprint file, declare the index
182 if (indexExists
&& ! fingerprintExists
)
185 // If the index seems to exist but contains dangling locks,
186 // declare the index non-existent.
189 DirectoryInfo lockDirInfo
= new DirectoryInfo (lockDir
);
190 if (! lockDirInfo
.Exists
)
193 foreach (FileInfo info
in lockDirInfo
.GetFiles ()) {
194 if (info
.Name
.IndexOf ("write.lock") != -1) {
200 log
.Debug ("Found dangling locks in {0}", lockDir
);
205 // Read in the fingerprint
206 StreamReader sr
= new StreamReader (fingerprintFile
);
207 fingerprint
= sr
.ReadLine ();
210 // We can't create the index if we are in
211 // read-only mode obviously.
213 throw new InvalidOperationException ("LuceneDriver is in read-only mode, but requires index-creation");
215 //if (minor_version < 0)
216 // throw new InvalidOperationException ("LuceneDriver was passed an unspecified minor_version, and index-creation is required.");
// FIXME: This isn't exactly the way to do it
219 if (minor_version
< 0)
222 // Purge and rebuild the index's directory
225 if (Directory
.Exists (top_dir
)) {
226 log
.Debug ("Purging {0}", top_dir
);
227 Directory
.Delete (top_dir
, true);
230 // Create all directories.
231 Directory
.CreateDirectory (top_dir
);
232 Directory
.CreateDirectory (lockDir
);
233 Directory
.CreateDirectory (indexDir
);
237 // Generate a fingerprint and write it out
238 fingerprint
= Guid
.NewGuid ().ToString ();
239 sw
= new StreamWriter (fingerprintFile
, false);
240 sw
.WriteLine (fingerprint
);
243 // Write out our version information
244 sw
= new StreamWriter (versionFile
, false);
245 sw
.WriteLine ("{0}.{1}", MAJOR_VERSION
, minor_version
);
249 Lucene
.Net
.Store
.FSDirectory store
;
251 store
= Lucene
.Net
.Store
.FSDirectory
.GetDirectory (indexDir
, lockDir
, false, disable_locking
);
254 ourStorePath
= indexDir
;
259 // This creates the index if it doesn't exist
260 IndexWriter writer
= new IndexWriter (Store
, null, true);
264 if (Environment
.GetEnvironmentVariable ("BEAGLE_OPTIMIZE_ON_STARTUP") != null) {
265 cycles_since_last_optimization
= 1000; // this can't be zero, or nothing will happen
270 /////////////////////////////////////////////////////
276 private static Logger log
= Logger
.Get ("lucene");
278 private static Logger Log
{
282 /////////////////////////////////////////////////////
285 // The Index's Fingerprint and up-to-date checking
288 private string fingerprint
= null;
290 public string Fingerprint
{
291 get { return fingerprint; }
294 /////////////////////////////////////////////////////
297 // Public Indexing API
300 static object [] empty_collection
= new object [0];
302 public void Add (Indexable indexable
)
304 Uri uri
= indexable
.Uri
;
306 lock (pending_by_uri
) {
307 if (pending_by_uri
.Contains (uri
) && pending_by_uri
[uri
] == null)
309 pending_by_uri
[uri
] = indexable
;
314 public void Remove (Uri uri
)
316 lock (pending_by_uri
) {
317 if (pending_by_uri
[uri
] != null)
319 pending_by_uri
[uri
] = null;
// Renames are not supported by this driver: log the attempt loudly
// (with both uris) so the broken caller can be tracked down.
public void Rename (Uri old_uri, Uri new_uri)
{
	Logger logger = Logger.Log;
	logger.Error ("**** LuceneDriver does not support Rename!");
	logger.Error ("**** old_uri={0}", old_uri);
	logger.Error ("**** new_uri={0}", new_uri);
}
331 public int PendingAdds
{
332 get { return pending_adds; }
335 public int PendingRemovals
{
336 get { return pending_removals; }
341 ArrayList pending_uris
;
342 ArrayList pending_indexables
;
344 ArrayList added_uris
;
345 ArrayList removed_uris
;
346 ArrayList filtered_uris
;
348 lock (pending_by_uri
) {
350 if (pending_by_uri
.Count
== 0)
353 pending_uris
= new ArrayList ();
354 pending_indexables
= new ArrayList ();
355 added_uris
= new ArrayList ();
356 removed_uris
= new ArrayList ();
357 filtered_uris
= new ArrayList ();
359 // Move our indexables and remove requests out of the
360 // hash and into local data structures.
361 foreach (DictionaryEntry entry
in pending_by_uri
) {
362 Uri uri
= (Uri
) entry
.Key
;
363 Indexable indexable
= (Indexable
) entry
.Value
;
365 pending_uris
.Add (uri
);
366 if (indexable
!= null)
367 pending_indexables
.Add (indexable
);
369 if (indexable
!= null)
370 added_uris
.Add (uri
);
372 removed_uris
.Add (uri
);
376 pending_removals
= 0;
377 pending_by_uri
.Clear ();
381 int removal_count
= 0;
383 Log
.Debug ("Flushing...");
385 Stopwatch watch
= new Stopwatch ();
387 // Step #1: Delete all items with the same URIs
388 // as our pending items from the index.
390 IndexReader reader
= IndexReader
.Open (Store
);
391 foreach (Uri uri
in pending_uris
) {
392 log
.Debug ("- {0}", uri
);
394 Term term
= new Term ("Uri", uri
.ToString ());
395 reader
.Delete (term
);
398 term
= new Term ("ParentUri", uri
.ToString ());
399 removal_count
+= reader
.Delete (term
);
401 last_item_count
= reader
.NumDocs ();
404 Log
.Debug ("Lucene Delete: {0} {1} {2}", watch
, pending_uris
.Count
,
405 pending_uris
.Count
/ watch
.ElapsedTime
);
408 // Step #2: Write out the pending adds
410 IndexWriter writer
= null;
412 // FIXME: Change this to a queue so that items
413 // that are added while we flush (like children) also
416 foreach (Indexable indexable
in pending_indexables
) {
418 Log
.Debug ("+ {0}", indexable
.DisplayUri
);
420 Filter filter
= null;
423 FilterFactory
.FilterIndexable (indexable
, this.text_cache
, out filter
);
424 } catch (Exception e
) {
425 Log
.Error ("Unable to filter {0} (mimetype={1})", indexable
.DisplayUri
, indexable
.MimeType
);
431 doc
= ToLuceneDocument (indexable
);
432 } catch (Exception e
) {
433 Log
.Error ("Unable to convert {0} (type={1}) to a lucene document",
434 indexable
.Uri
, indexable
.Type
);
440 writer
= new IndexWriter (Store
, Analyzer
, false);
441 writer
.AddDocument (doc
);
446 if (filter
!= null) {
447 filtered_uris
.Add (FilteredStatus
.New (indexable
, filter
));
450 if (filter
!= null && filter
.ChildIndexables
.Count
> 0) {
451 // Iterate across any indexables created by the
452 // filter and set up the parent-child relationship.
453 foreach (Indexable child
in filter
.ChildIndexables
)
454 child
.SetChildOf (indexable
);
456 // If nobody is listening for ChildIndexableEvent,
// just add them ourselves so we won't lose them
458 if (ChildIndexableEvent
!= null)
459 ChildIndexableEvent ((Indexable
[]) filter
.ChildIndexables
.ToArray (typeof (Indexable
)));
461 foreach (Indexable child
in filter
.ChildIndexables
)
468 Log
.Debug ("Lucene Add: {0} {1} {2}", watch
, pending_indexables
.Count
,
469 pending_indexables
.Count
/ watch
.ElapsedTime
);
472 // Step #3: Fire off an event telling what we just did.
473 if (ChangedEvent
!= null) {
474 ChangedEvent (this, added_uris
, removed_uris
, empty_collection
);
477 if (filtered_uris
.Count
> 0 && UrisFilteredEvent
!= null) {
478 UrisFilteredEvent ((FilteredStatus
[]) filtered_uris
.ToArray (typeof (FilteredStatus
)));
481 lock (pending_by_uri
)
482 cycles_since_last_optimization
++;
488 private bool NeedsOptimize
{
490 // FIXME: 19 is a totally arbitrary number.
491 return cycles_since_last_optimization
> 19;
495 private void Optimize ()
497 // If nothing has happened since our last optimization,
499 // If this index is already being optimized, don't
500 // optimize it again.
501 lock (pending_by_uri
) {
502 if (optimizing
|| cycles_since_last_optimization
== 0)
507 Log
.Debug ("Optimizing {0}...", StorePath
);
509 Stopwatch watch
= new Stopwatch ();
512 IndexWriter writer
= new IndexWriter (Store
, null, false);
518 Log
.Debug ("Optimization time for {0}: {1}", StorePath
, watch
);
520 lock (pending_by_uri
) {
522 cycles_since_last_optimization
= 0;
526 /////////////////////////////////////////////////////
528 // Returns the lowest matching score before the results are
530 public void DoQuery (Query query
,
532 ICollection search_subset
, // should be internal uris
533 ICollection bonus_uris
, // should be internal uris
534 UriFilter uri_filter
,
535 UriRemapper uri_remapper
, // map to external uris
536 HitProcessor hit_processor
, // post-process hits
537 RelevancyMultiplier relevancy_multiplier
)
542 LNS
.Query lucene_query
= ToLuceneQuery (query
, search_subset
, bonus_uris
);
543 if (lucene_query
== null)
546 Stopwatch sw
= new Stopwatch ();
548 LNS
.Searcher searcher
= new LNS
.IndexSearcher (Store
);
549 LNS
.Hits hits
= searcher
.Search (lucene_query
);
552 t_lucene
= sw
.ElapsedTime
;
554 //////////////////////////////////////
559 int n_hits
= hits
.Length ();
563 for (int i
= 0; i
< n_hits
; ++i
) {
564 Document doc
= hits
.Doc (i
);
566 if (uri_filter
!= null) {
567 Uri uri
= UriFromLuceneDoc (doc
);
568 if (! uri_filter (uri
))
572 double score
= (double) hits
.Score (i
);
574 if (result
.WillReject (score
)) {
575 log
.Debug ("Terminating DoQuery at {0} of {1} (score={2})", i
, n_hits
, score
);
579 Hit hit
= FromLuceneDocToHit (doc
, hits
.Id (i
), score
);
580 if (uri_remapper
!= null)
581 hit
.Uri
= uri_remapper (hit
.Uri
);
583 if (hit_processor
!= null)
584 hit
= hit_processor (hit
);
586 if (relevancy_multiplier
!= null) {
587 double m
= relevancy_multiplier (hit
);
588 hit
.ScoreMultiplier
= (float) m
;
596 t_assembly
= sw
.ElapsedTime
;
598 //////////////////////////////////////
602 log
.Debug ("{0}: n_hits={1} lucene={2:0.00}s assembly={3:0.00}s",
603 StorePath
, n_hits
, t_lucene
, t_assembly
);
606 // FIXME: This should support Uri filtering, Uri remapping, etc.
607 public ICollection
DoQueryByUri (ICollection list_of_uris
)
609 LNS
.BooleanQuery uri_query
= new LNS
.BooleanQuery ();
610 LNS
.Searcher searcher
;
611 LNS
.Hits lucene_hits
;
612 ArrayList all_hits
= new ArrayList ();
614 int max_clauses
= LNS
.BooleanQuery
.GetMaxClauseCount ();
615 int clause_count
= 0;
617 foreach (Uri uri
in list_of_uris
) {
618 Term term
= new Term ("Uri", uri
.ToString ());
619 LNS
.Query term_query
= new LNS
.TermQuery (term
);
620 uri_query
.Add (term_query
, false, false);
623 if (clause_count
== max_clauses
) {
624 searcher
= new LNS
.IndexSearcher (Store
);
625 lucene_hits
= searcher
.Search (uri_query
);
626 int n_hits
= lucene_hits
.Length ();
628 for (int i
= 0; i
< n_hits
; ++i
) {
629 Hit hit
= FromLuceneDocToHit (lucene_hits
.Doc (i
),
631 lucene_hits
.Score (i
));
636 uri_query
= new LNS
.BooleanQuery ();
641 if (clause_count
> 0) {
642 searcher
= new LNS
.IndexSearcher (Store
);
643 lucene_hits
= searcher
.Search (uri_query
);
644 int n_hits
= lucene_hits
.Length ();
646 for (int i
= 0; i
< n_hits
; ++i
) {
647 Hit hit
= FromLuceneDocToHit (lucene_hits
.Doc (i
),
649 lucene_hits
.Score (i
));
// Convenience overload: look up a single uri by wrapping it in a
// one-element array and delegating to the collection form.
public ICollection DoQueryByUri (Uri uri)
{
	Uri [] single = new Uri [] { uri };
	return DoQueryByUri (single);
}
664 // We cache the number of documents in the index when readers are
665 // available, so calls to GetItemCount will return immediately
666 // if the driver has been flushed or queried.
// Returns the number of documents in the index.  The count is cached
// in last_item_count by the flush/query paths; we only open a reader
// when the cache is invalid (< 0).
public int GetItemCount ()
{
	if (last_item_count < 0) {
		IndexReader reader = IndexReader.Open (Store);
		last_item_count = reader.NumDocs ();
		// Release the reader: leaking it keeps index files (and any
		// associated locks) open for the lifetime of the process.
		reader.Close ();
	}
	return last_item_count;
}
678 ///////////////////////////////////////////////////////////////////////////////////////
681 // Code to map to/from Lucene data types
// Map a PropertyType to the single-character code embedded in the
// Lucene field name ("prop:<code>:<name>").
static char PropertyTypeToCode (PropertyType type)
{
	if (type == PropertyType.Text)
		return '_';
	if (type == PropertyType.Keyword)
		return 'k';
	if (type == PropertyType.Date)
		return 'd';

	throw new Exception ("Unknown property type: " + type);
}
// Inverse of PropertyTypeToCode: decode the single-character field
// code back into a PropertyType.
static PropertyType CodeToPropertyType (char c)
{
	if (c == '_')
		return PropertyType.Text;
	if (c == 'k')
		return PropertyType.Keyword;
	if (c == 'd')
		return PropertyType.Date;

	throw new Exception ("Unknown property code: " + c);
}
706 static string PropertyTypeToWildcardField (PropertyType type
)
709 case PropertyType
.Text
: return "PropertyText";
710 case PropertyType
.Keyword
: return "PropertyKeyword";
711 case PropertyType
.Date
: return "PropertyDate";
717 private void AddPropertyToLuceneDocument (Document doc
, Property prop
)
719 if (prop
.Value
== null)
725 is_text
= prop
.Type
== PropertyType
.Text
;
727 if (prop
.IsSearched
) {
728 string wildcard_field
;
729 wildcard_field
= PropertyTypeToWildcardField (prop
.Type
);
730 if (wildcard_field
!= null) {
731 f
= new Field (wildcard_field
,
733 false, // never stored
734 true, // always index
735 is_text
); // only tokenize text
740 f
= new Field (String
.Format ("prop:{0}:{1}",
741 PropertyTypeToCode (prop
.Type
),
744 true, // always store
745 true, // always index
746 is_text
); // only tokenize text
750 private Property
FieldToProperty (Field field
)
752 string name
= field
.Name ();
753 if (name
.Length
< 7 || ! name
.StartsWith ("prop:"))
756 string key
= name
.Substring (7);
757 string value = field
.StringValue ();
760 return Property
.NewKeyword (key
, value);
761 else if (name
[5] == 'd')
762 return Property
.NewDateFromString (key
, value);
763 else if (name
[6] == 's')
764 return Property
.NewUnsearched (key
, value);
766 return Property
.New (key
, value);
769 private Document
ToLuceneDocument (Indexable indexable
)
771 Document doc
= new Document ();
776 // First we add the Indexable's 'canonical' properties
779 f
= Field
.Keyword ("Uri", UriFu
.UriToSerializableString (indexable
.Uri
));
782 f
= Field
.Keyword ("Type", indexable
.Type
);
785 if (indexable
.ParentUri
!= null) {
786 f
= Field
.Keyword ("ParentUri", UriFu
.UriToSerializableString (indexable
.ParentUri
));
790 if (indexable
.MimeType
!= null) {
791 f
= Field
.Keyword ("MimeType", indexable
.MimeType
);
795 if (indexable
.ValidTimestamp
) {
796 str
= StringFu
.DateTimeToString (indexable
.Timestamp
);
797 f
= Field
.Keyword ("Timestamp", str
);
801 if (! indexable
.NoContent
) {
803 reader
= indexable
.GetTextReader ();
804 if (reader
!= null) {
805 f
= Field
.Text ("Text", reader
);
809 reader
= indexable
.GetHotTextReader ();
810 if (reader
!= null) {
811 f
= Field
.Text ("HotText", reader
);
816 foreach (Property prop
in indexable
.Properties
)
817 AddPropertyToLuceneDocument (doc
, prop
);
822 static public LNS
.Query
ToUriQuery (ICollection list_of_uris
, UriRemapper remapper
)
824 if (list_of_uris
== null || list_of_uris
.Count
== 0)
827 LNS
.BooleanQuery query
= new LNS
.BooleanQuery ();
828 int max_clauses
= LNS
.BooleanQuery
.GetMaxClauseCount ();
829 int clause_count
= 0;
831 foreach (Uri original_uri
in list_of_uris
) {
832 Uri uri
= original_uri
;
833 if (remapper
!= null)
834 uri
= remapper (uri
);
835 //Logger.Log.Debug ("ToUriQuery: {0} => {1}", original_uri, uri);
836 Term term
= new Term ("Uri", uri
.ToString ()); // FIXME: Do we need some UriFu here?
837 LNS
.Query term_query
= new LNS
.TermQuery (term
);
838 query
.Add (term_query
, false, false);
// If we have too many clauses, nest the queries
841 if (clause_count
== max_clauses
) {
842 LNS
.BooleanQuery new_query
= new LNS
.BooleanQuery ();
843 new_query
.Add (query
, false, false);
// Convenience overload: build a uri query with no remapping applied.
static public LNS.Query ToUriQuery (ICollection list_of_uris)
{
	// A null remapper means uris are matched as-is.
	UriRemapper no_remap = null;
	return ToUriQuery (list_of_uris, no_remap);
}
857 static private LNS
.Query
NewTokenizedQuery (string field
, string text
)
859 ArrayList tokens
= new ArrayList ();
861 // Use the analyzer to extract the query's tokens.
862 // This code is taken from Lucene's query parser.
863 // We use the standard Analyzer.
864 TokenStream source
= LuceneDriver
.Analyzer
.TokenStream (field
, new StringReader (text
));
866 Lucene
.Net
.Analysis
.Token t
;
869 } catch (IOException
) {
874 tokens
.Add (t
.TermText ());
878 } catch (IOException
) {
883 if (tokens
.Count
== 1) {
884 Term t
= new Term (field
, (string) tokens
[0]);
885 q
= new LNS
.TermQuery (t
);
886 } else if (tokens
.Count
> 1) {
887 q
= new LNS
.PhraseQuery ();
888 foreach (string tokenStr
in tokens
) {
889 Term t
= new Term (field
, tokenStr
);
890 ((LNS
.PhraseQuery
) q
).Add (t
);
897 // search_subset limits the score of our search to that set of Uris
898 // bonus_uris are always matched by the query
899 private LNS
.Query
ToLuceneQuery (Query query
,
900 ICollection search_subset
,
901 ICollection bonus_uris
)
903 LNS
.BooleanQuery body_query
= null;
904 LNS
.Query search_subset_query
= null;
905 LNS
.Query bonus_uris_query
= null;
906 LNS
.BooleanQuery mime_type_query
= null;
907 LNS
.BooleanQuery hit_type_query
= null;
909 body_query
= new LNS
.BooleanQuery ();
911 bool used_any_part
= false;
913 foreach (QueryPart part
in query
.Parts
) {
915 LNS
.BooleanQuery part_query
= new LNS
.BooleanQuery ();
916 LNS
.Query part_query_override
= null;
917 LNS
.Query subquery
= null;
919 bool used_this_part
= false;
921 if (part
.TargetIsAll
|| part
.TargetIsText
) {
923 subquery
= NewTokenizedQuery ("Text", part
.Text
);
924 if (subquery
!= null) {
925 part_query
.Add (subquery
, false, false);
926 used_this_part
= true;
929 subquery
= NewTokenizedQuery ("HotText", part
.Text
);
930 if (subquery
!= null) {
931 subquery
.SetBoost (1.75f
);
932 part_query
.Add (subquery
, false, false);
933 used_this_part
= true;
937 if (part
.TargetIsAll
|| part
.TargetIsProperties
) {
938 subquery
= NewTokenizedQuery ("PropertyText", part
.Text
);
939 if (subquery
!= null) {
940 subquery
.SetBoost (1.75f
);
941 part_query
.Add (subquery
, false, false);
942 used_this_part
= true;
946 if (part
.TargetIsSpecificProperty
) {
949 prop_name
= String
.Format ("prop:{0}:{1}",
950 part
.IsKeyword
? 'k' : '_',
953 if (part
.IsKeyword
) {
954 Term term
= new Term (prop_name
, part
.Text
);
955 subquery
= new LNS
.TermQuery (term
);
957 subquery
= NewTokenizedQuery (prop_name
, part
.Text
);
960 // Instead of the boolean query, just use the subquery.
961 if (subquery
!= null) {
962 part_query_override
= subquery
;
963 used_this_part
= true;
967 if (used_this_part
) {
968 if (part_query_override
== null)
969 part_query_override
= part_query
;
970 body_query
.Add (part_query_override
, part
.IsRequired
, part
.IsProhibited
);
971 used_any_part
= true;
978 search_subset_query
= ToUriQuery (search_subset
, null);
980 bonus_uris_query
= ToUriQuery (bonus_uris
, null);
982 if (query
.MimeTypes
.Count
> 0) {
983 mime_type_query
= new LNS
.BooleanQuery ();
984 foreach (string mime_type
in query
.MimeTypes
) {
985 Term t
= new Term ("MimeType", mime_type
);
986 LNS
.Query q
= new LNS
.TermQuery (t
);
987 mime_type_query
.Add (q
, false, false);
991 if (query
.HasHitTypes
) {
992 hit_type_query
= new LNS
.BooleanQuery ();
993 foreach (string hit_type
in query
.HitTypes
) {
994 Term t
= new Term ("Type", hit_type
);
995 LNS
.Query q
= new LNS
.TermQuery (t
);
996 hit_type_query
.Add (q
, false, false);
1001 // Now we combine the various parts into one big query.
1004 LNS
.BooleanQuery total_query
= new LNS
.BooleanQuery ();
1006 // If we have hit types or mime types, those must be matched
1007 if (mime_type_query
!= null)
1008 total_query
.Add (mime_type_query
, true, false);
1009 if (hit_type_query
!= null)
1010 total_query
.Add (hit_type_query
, true, false);
1012 // We also must match the "content query":
1013 // (body_query OR bonus_uris_query) AND search_subset_query
1015 LNS
.Query content_query
= null;
1017 if (body_query
!= null && bonus_uris_query
!= null) {
1018 LNS
.BooleanQuery q
= new LNS
.BooleanQuery ();
1019 q
.Add (body_query
, false, false);
1020 q
.Add (bonus_uris_query
, false, false);
1022 } else if (body_query
!= null) {
1023 content_query
= body_query
;
1024 } else if (bonus_uris_query
!= null) {
1025 content_query
= bonus_uris_query
;
1028 if (content_query
!= null && search_subset_query
!= null) {
1029 LNS
.BooleanQuery q
= new LNS
.BooleanQuery ();
1030 q
.Add (content_query
, true, false);
1031 q
.Add (search_subset_query
, true, false);
1033 } else if (search_subset_query
!= null) {
1034 content_query
= search_subset_query
;
1037 if (content_query
!= null)
1038 total_query
.Add (content_query
, true, false);
1043 static private Uri
UriFromLuceneDoc (Document doc
)
1045 string uri
= doc
.Get ("Uri");
1047 throw new Exception ("Got document from Lucene w/o a URI!");
1048 return UriFu
.UriStringToUri (uri
);
1051 static private void FromLuceneDocToVersioned (Document doc
, Versioned versioned
)
1055 str
= doc
.Get ("Timestamp");
1057 versioned
.Timestamp
= StringFu
.StringToDateTime (str
);
1060 private Hit
FromLuceneDocToHit (Document doc
, int id
, double score
)
1062 Hit hit
= new Hit ();
1068 FromLuceneDocToVersioned (doc
, hit
);
1070 hit
.Uri
= UriFromLuceneDoc (doc
);
1072 str
= doc
.Get ("Type");
1074 throw new Exception ("Got hit from Lucene w/o a Type!");
1077 str
= doc
.Get ("ParentUri");
1079 hit
.ParentUri
= UriFu
.UriStringToUri (str
);
1081 hit
.MimeType
= doc
.Get ("MimeType");
1083 hit
.Source
= "lucene";
1084 hit
.ScoreRaw
= score
;
1086 foreach (Field ff
in doc
.Fields ()) {
1087 Property prop
= FieldToProperty (ff
);
1089 hit
.AddProperty (prop
);
1096 /////////////////////////////////////////////////////
1099 // A common, shared analyzer
1102 private class BeagleNoiseFilter
: TokenFilter
{
1104 static int total_count
= 0;
1105 static int noise_count
= 0;
1107 TokenStream token_stream
;
1109 public BeagleNoiseFilter (TokenStream input
) : base (input
)
1111 token_stream
= input
;
1114 // FIXME: we should add some heuristics that are stricter
1115 // but explicitly try to avoid filtering out dates,
1116 // phone numbers, etc.
1117 private static bool IsNoise (string text
)
1119 // Anything really long is almost certainly noise.
1120 if (text
.Length
> 30)
1123 // Look at how often we switch between numbers and letters.
1125 // <letter> <digit> 1
1126 // <digit> <letter> 1
1127 // <x> <punct>+ <x> 1
1128 // <x> <punct>+ <y> 2
1129 const int transitions_cutoff
= 4;
1130 int last_type
= -1, last_non_punct_type
= -1, first_type
= -1;
1131 bool has_letter
= false, has_digit
= false, has_punctuation
= false;
1132 int transitions
= 0;
1133 for (int i
= 0; i
< text
.Length
&& transitions
< transitions_cutoff
; ++i
) {
1136 if (Char
.IsLetter (c
)) {
1139 } else if (Char
.IsDigit (c
)) {
1142 } else if (Char
.IsPunctuation (c
)) {
1144 has_punctuation
= true;
1149 if (type
!= last_type
) {
1150 if (last_type
== 3) {
1151 if (type
!= last_non_punct_type
)
1158 if (first_type
== -1)
1163 last_non_punct_type
= type
;
1167 // If we make too many transitions, it must be noise.
1168 if (transitions
>= transitions_cutoff
)
1171 // If we consist of nothing but digits and punctuation, treat it
1172 // as noise if it is too long.
1173 if (transitions
== 1 && first_type
!= 1 && text
.Length
> 10)
1176 // We are very suspicious of long things that make lots of
1178 if (transitions
> 3 && text
.Length
> 10)
1181 // Beware of anything long that contains a little of everything.
1182 if (has_letter
&& has_digit
&& has_punctuation
&& text
.Length
> 10)
1185 //Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
1190 public override Lucene
.Net
.Analysis
.Token
Next ()
1192 Lucene
.Net
.Analysis
.Token token
;
1193 while ( (token
= token_stream
.Next ()) != null) {
1195 if (total_count
> 0 && total_count
% 5000 == 0)
1196 Logger
.Log
.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)",
1197 noise_count
, total_count
, 100.0 * noise_count
/ total_count
);
1200 if (IsNoise (token
.TermText ())) {
1210 // This is just a standard analyzer combined with the Porter stemmer.
1211 // FIXME: This assumes everything being indexed is in English!
1212 private class BeagleAnalyzer
: StandardAnalyzer
{
1213 public override TokenStream
TokenStream (String fieldName
, TextReader reader
)
1215 TokenStream outstream
= base.TokenStream (fieldName
, reader
);
1216 if (fieldName
== "Text" || fieldName
== "HotText")
1217 outstream
= new BeagleNoiseFilter (outstream
);
1218 outstream
= new PorterStemFilter (outstream
);
1223 private static Analyzer theAnalyzer
;
1225 private static Analyzer Analyzer
{
1227 if (theAnalyzer
== null)
1228 theAnalyzer
= new BeagleAnalyzer ();
1233 /////////////////////////////////////////////////////
1236 // Access to the Stemmer
1239 static PorterStemmer stemmer
= new PorterStemmer ();
// Run a single word through the shared Porter stemmer.
static public string Stem (string str)
{
	string stemmed = stemmer.Stem (str);
	return stemmed;
}
1246 /////////////////////////////////////////////////////
// A word is a stop word iff it appears in Lucene's English stop list.
// Callers are expected to pass an already-stemmed word.
public static bool IsStopWord (string stemmed_word)
{
	int idx = ArrayFu.IndexOfString (StopAnalyzer.ENGLISH_STOP_WORDS, stemmed_word);
	return idx != -1;
}
1253 /////////////////////////////////////////////////////
1256 // Helpful little utility functions
// Render an index revision number in the string form stored in the index.
static private String RevisionToString (long rev)
{
	return rev.ToString ();
}
// Parse a stored revision string back into a revision number.
static private long StringToRevision (String str)
{
	long rev = Convert.ToInt64 (str);
	return rev;
}
1269 /////////////////////////////////////////////////////
// Merge an external index (laid out as <merge_dir>/Index plus
// <merge_dir>/Locks) into this driver's store.
// Throws if the directory does not look like an index.
public void Merge (string merge_dir)
{
	string merge_index_dir = Path.Combine (merge_dir, "Index");
	string merge_locks_dir = Path.Combine (merge_dir, "Locks");

	if (!Directory.Exists (merge_index_dir) || !Directory.Exists (merge_locks_dir)) {
		throw new Exception ("Index does not exist");
	}

	// FIXME: Error recovery

	Lucene.Net.Store.FSDirectory store =
		Lucene.Net.Store.FSDirectory.GetDirectory (merge_index_dir, merge_locks_dir, false);
	Lucene.Net.Store.Directory [] stores = {store};

	IndexWriter writer = new IndexWriter (Store, null, false);
	writer.AddIndexes (stores);
	// Close the writer so the merged segments are flushed and the
	// write lock on our store is released.
	writer.Close ();
}
1290 /////////////////////////////////////////////////////
1292 // Expose some information for debugging and analytical purposes.
1294 public void WriteIndexTermFrequencies (TextWriter writer
)
1296 IndexReader reader
= IndexReader
.Open (Store
);
1297 TermEnum term_enum
= reader
.Terms ();
1301 while (term_enum
.Next ()) {
1302 term
= term_enum
.Term ();
1303 int freq
= term_enum
.DocFreq ();
1304 writer
.WriteLine ("{0} {1} {2}", term
.Field (), freq
, term
.Text ());