4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This should be the only piece of source code that knows anything
29 // about Lucene's internals.
33 using System
.Collections
;
34 using System
.Diagnostics
;
35 using System
.Globalization
;
38 using System
.Threading
;
40 using System
.Xml
.Serialization
;
42 using Lucene
.Net
.Analysis
;
43 using Lucene
.Net
.Analysis
.Standard
;
44 using Lucene
.Net
.Documents
;
45 using Lucene
.Net
.Index
;
46 using Lucene
.Net
.QueryParsers
;
47 using LNS
= Lucene
.Net
.Search
;
51 namespace Beagle
.Daemon
{
53 public class LuceneDriver
: IIndexer
{
55 public delegate bool UriFilter (Uri uri
);
56 public delegate Uri
UriRemapper (Uri uri
);
57 public delegate double RelevancyMultiplier (Hit hit
);
59 public event IIndexerChangedHandler ChangedEvent
;
61 /////////////////////////////////////////////////////
64 // 2: Changed format of timestamp strings
65 // 3: Schema changed to be more Dashboard-Match-like
66 // 4: Schema changed for files to include _Directory property
67 // 5: Changed analyzer to support stemming. Bumped version # to
68 // force everyone to re-index.
69 // 6: lots of schema changes as part of the general refactoring
70 // 7: incremented to force a re-index after our upgrade to lucene 1.4
71 // (in theory the file formats are compatible, we are seeing 'term
72 // out of order' exceptions in some cases)
73 // 8: another forced re-index, this time because of massive changes
74 // in the file system backend (it would be nice to have per-backend
75 // versioning so that we didn't have to purge all indexes just
76 // because one changed)
77 private const int VERSION
= 8;
79 private string top_dir
;
80 private Hashtable pending_by_uri
= UriFu
.NewHashtable ();
81 private int pending_adds
= 0;
82 private int pending_removals
= 0;
83 private int cycles_since_last_optimization
= 0;
84 private bool optimizing
= false;
85 private int last_item_count
= -1;
87 public LuceneDriver (string index_name
)
92 public string IndexDirectory
{
93 get { return top_dir; }
96 /////////////////////////////////////////////////////
102 private Lucene
.Net
.Store
.Directory ourStore
= null;
103 private string ourStorePath
= null;
106 public Lucene
.Net
.Store
.Directory Store
{
107 get { return ourStore; }
110 public string StorePath
{
111 get { return ourStorePath; }
114 /////////////////////////////////////////////////////
117 private void Setup (string index_name
)
119 top_dir
= Path
.Combine (PathFinder
.StorageDir
, index_name
);
121 string versionFile
= Path
.Combine (top_dir
, "version");
122 string fingerprintFile
= Path
.Combine (top_dir
, "fingerprint");
123 string lockDir
= Path
.Combine (top_dir
, "Locks");
124 string indexDir
= Path
.Combine (top_dir
, "Index");
125 string indexTestFile
= Path
.Combine (indexDir
, "segments");
127 bool versionExists
= File
.Exists (versionFile
);
128 bool fingerprintExists
= File
.Exists (fingerprintFile
);
129 bool indexExists
= File
.Exists (indexTestFile
);
131 // Check the index's version number. If it is wrong,
132 // declare the index non-existent.
133 if (versionExists
&& indexExists
) {
134 StreamReader sr
= new StreamReader (versionFile
);
135 string versionStr
= sr
.ReadLine ();
138 if (versionStr
!= Convert
.ToString (VERSION
))
142 // If there is no fingerprint file, declare the index non-existent.
144 if (indexExists
&& ! fingerprintExists
)
147 // If the index seems to exist but contains dangling locks,
148 // declare the index non-existent.
150 DirectoryInfo lockDirInfo
= new DirectoryInfo (lockDir
);
151 if (! lockDirInfo
.Exists
)
154 foreach (FileInfo info
in lockDirInfo
.GetFiles ()) {
155 if (info
.Name
.IndexOf (".lock") != -1) {
161 log
.Debug ("Found dangling locks in {0}", lockDir
);
166 // Read in the fingerprint
167 StreamReader sr
= new StreamReader (fingerprintFile
);
168 fingerprint
= sr
.ReadLine ();
172 // Purge and rebuild the index's directory
175 if (Directory
.Exists (top_dir
)) {
176 log
.Debug ("Purging {0}", top_dir
);
177 Directory
.Delete (top_dir
, true);
180 // Create all directories.
181 Directory
.CreateDirectory (top_dir
);
182 Directory
.CreateDirectory (lockDir
);
183 Directory
.CreateDirectory (indexDir
);
187 // Generate a fingerprint and write it out
188 fingerprint
= Guid
.NewGuid ().ToString ();
189 sw
= new StreamWriter (fingerprintFile
, false);
190 sw
.WriteLine (fingerprint
);
193 // Write out our version information
194 sw
= new StreamWriter (versionFile
, false);
195 sw
.WriteLine ("{0}", VERSION
);
199 Lucene
.Net
.Store
.FSDirectory store
;
200 store
= Lucene
.Net
.Store
.FSDirectory
.GetDirectory (indexDir
, lockDir
, false);
202 ourStorePath
= indexDir
;
207 // This creates the index if it doesn't exist
208 IndexWriter writer
= new IndexWriter (Store
, null, true);
213 /////////////////////////////////////////////////////
219 private static Logger log
= Logger
.Get ("lucene");
221 private static Logger Log
{
225 /////////////////////////////////////////////////////
228 // The Index's Fingerprint and up-to-date checking
231 private string fingerprint
= null;
233 public string Fingerprint
{
234 get { return fingerprint; }
237 /////////////////////////////////////////////////////
240 // Public Indexing API
243 static object [] empty_collection
= new object [0];
245 public void Add (Indexable indexable
)
247 Uri uri
= indexable
.Uri
;
249 lock (pending_by_uri
) {
250 if (pending_by_uri
.Contains (uri
) && pending_by_uri
[uri
] == null)
252 pending_by_uri
[uri
] = indexable
;
257 public void Remove (Uri uri
)
259 lock (pending_by_uri
) {
260 if (pending_by_uri
[uri
] != null)
262 pending_by_uri
[uri
] = null;
267 public void Rename (Uri old_uri
, Uri new_uri
)
269 Logger
.Log
.Error ("**** LuceneDriver does not support Rename!");
270 Logger
.Log
.Error ("**** old_uri={0}", old_uri
);
271 Logger
.Log
.Error ("**** new_uri={0}", new_uri
);
274 public int PendingAdds
{
275 get { return pending_adds; }
278 public int PendingRemovals
{
279 get { return pending_removals; }
284 ArrayList pending_uris
;
285 ArrayList pending_indexables
;
287 ArrayList added_uris
;
288 ArrayList removed_uris
;
290 lock (pending_by_uri
) {
292 if (pending_by_uri
.Count
== 0)
295 pending_uris
= new ArrayList ();
296 pending_indexables
= new ArrayList ();
297 added_uris
= new ArrayList ();
298 removed_uris
= new ArrayList ();
300 // Move our indexables and remove requests out of the
301 // hash and into local data structures.
302 foreach (DictionaryEntry entry
in pending_by_uri
) {
303 Uri uri
= (Uri
) entry
.Key
;
304 Indexable indexable
= (Indexable
) entry
.Value
;
306 pending_uris
.Add (uri
);
307 if (indexable
!= null)
308 pending_indexables
.Add (indexable
);
310 if (indexable
!= null)
311 added_uris
.Add (uri
);
313 removed_uris
.Add (uri
);
317 pending_removals
= 0;
318 pending_by_uri
.Clear ();
322 int removal_count
= 0;
324 Log
.Debug ("Flushing...");
326 Stopwatch watch
= new Stopwatch ();
328 // Step #1: Delete all items with the same URIs
329 // as our pending items from the index.
331 IndexReader reader
= IndexReader
.Open (Store
);
332 foreach (Uri uri
in pending_uris
) {
333 log
.Debug ("- {0}", uri
);
334 Term term
= new Term ("Uri", uri
.ToString ());
335 reader
.Delete (term
);
338 last_item_count
= reader
.NumDocs ();
341 Log
.Debug ("Lucene Delete: {0} {1} {2}", watch
, pending_uris
.Count
,
342 pending_uris
.Count
/ watch
.ElapsedTime
);
345 // Step #2: Write out the pending adds
347 IndexWriter writer
= null;
348 foreach (Indexable indexable
in pending_indexables
) {
350 Log
.Debug ("+ {0}", indexable
.DisplayUri
);
354 doc
= ToLuceneDocument (indexable
);
355 } catch (Exception e
) {
356 Log
.Error ("Unable to convert {0} (type={1}) to a lucene document",
357 indexable
.Uri
, indexable
.Type
);
363 writer
= new IndexWriter (Store
, Analyzer
, false);
364 writer
.AddDocument (doc
);
372 Log
.Debug ("Lucene Add: {0} {1} {2}", watch
, pending_indexables
.Count
,
373 pending_indexables
.Count
/ watch
.ElapsedTime
);
376 // Step #3: Fire off an event telling what we just did.
377 if (ChangedEvent
!= null) {
378 ChangedEvent (this, added_uris
, removed_uris
, empty_collection
);
381 lock (pending_by_uri
)
382 cycles_since_last_optimization
++;
388 private bool NeedsOptimize
{
390 // FIXME: 19 is a totally arbitrary number.
391 return cycles_since_last_optimization
> 19;
395 private void Optimize ()
397 // If nothing has happened since our last optimization,
399 // If this index is already being optimized, don't
400 // optimize it again.
401 lock (pending_by_uri
) {
402 if (optimizing
|| cycles_since_last_optimization
== 0)
407 Log
.Debug ("Optimizing {0}...", StorePath
);
409 Stopwatch watch
= new Stopwatch ();
412 IndexWriter writer
= new IndexWriter (Store
, null, false);
418 Log
.Debug ("Optimization time for {0}: {1}", StorePath
, watch
);
420 lock (pending_by_uri
) {
422 cycles_since_last_optimization
= 0;
426 /////////////////////////////////////////////////////
428 // Returns the lowest matching score before the results are
430 public void DoQuery (QueryBody body
,
432 ICollection search_subset
, // should be internal uris
433 ICollection bonus_uris
, // should be internal uris
434 UriFilter uri_filter
,
435 UriRemapper uri_remapper
, // map to external uris
436 RelevancyMultiplier relevancy_multiplier
)
441 Stopwatch sw
= new Stopwatch ();
443 IndexReader reader
= IndexReader
.Open (Store
);
445 LNS
.Searcher searcher
= new LNS
.IndexSearcher (reader
);
446 LNS
.Query query
= ToLuceneQuery (body
, search_subset
, bonus_uris
);
448 LNS
.Hits hits
= searcher
.Search (query
);
451 t_lucene
= sw
.ElapsedTime
;
453 //////////////////////////////////////
458 int n_hits
= hits
.Length ();
462 for (int i
= 0; i
< n_hits
; ++i
) {
463 Document doc
= hits
.Doc (i
);
465 if (uri_filter
!= null) {
466 Uri uri
= UriFromLuceneDoc (doc
);
467 if (! uri_filter (uri
))
471 double score
= (double) hits
.Score (i
);
473 if (result
.WillReject (score
)) {
474 log
.Debug ("Terminating DoQuery at {0} of {1} (score={2})", i
, n_hits
, score
);
478 Hit hit
= FromLuceneDocToHit (doc
, hits
.Id (i
), score
);
479 if (uri_remapper
!= null)
480 hit
.Uri
= uri_remapper (hit
.Uri
);
482 if (relevancy_multiplier
!= null) {
483 double m
= relevancy_multiplier (hit
);
484 hit
.ScoreMultiplier
= (float) m
;
491 t_assembly
= sw
.ElapsedTime
;
493 //////////////////////////////////////
495 // The call to searcher.Close () also closes the IndexReader.
498 log
.Debug ("{0}: n_hits={1} lucene={2:0.00}s assembly={3:0.00}s",
499 StorePath
, n_hits
, t_lucene
, t_assembly
);
502 // FIXME: This should support Uri filtering, Uri remapping, etc.
503 public ICollection
DoQueryByUri (ICollection list_of_uris
)
505 LNS
.BooleanQuery uri_query
= new LNS
.BooleanQuery ();
506 LNS
.Searcher searcher
;
507 LNS
.Hits lucene_hits
;
508 ArrayList all_hits
= new ArrayList ();
510 int max_clauses
= LNS
.BooleanQuery
.GetMaxClauseCount ();
511 int clause_count
= 0;
513 foreach (Uri uri
in list_of_uris
) {
514 Term term
= new Term ("Uri", uri
.ToString ());
515 LNS
.Query term_query
= new LNS
.TermQuery (term
);
516 uri_query
.Add (term_query
, false, false);
519 if (clause_count
== max_clauses
) {
520 searcher
= new LNS
.IndexSearcher (Store
);
521 lucene_hits
= searcher
.Search (uri_query
);
522 int n_hits
= lucene_hits
.Length ();
524 for (int i
= 0; i
< n_hits
; ++i
) {
525 Hit hit
= FromLuceneDocToHit (lucene_hits
.Doc (i
),
527 lucene_hits
.Score (i
));
532 uri_query
= new LNS
.BooleanQuery ();
537 if (clause_count
> 0) {
538 searcher
= new LNS
.IndexSearcher (Store
);
539 lucene_hits
= searcher
.Search (uri_query
);
540 int n_hits
= lucene_hits
.Length ();
542 for (int i
= 0; i
< n_hits
; ++i
) {
543 Hit hit
= FromLuceneDocToHit (lucene_hits
.Doc (i
),
545 lucene_hits
.Score (i
));
555 public ICollection
DoQueryByUri (Uri uri
)
557 return DoQueryByUri (new Uri
[1] { uri }
);
560 // We cache the number of documents in the index when readers are
561 // available, so calls to GetItemCount will return immediately
562 // if the driver has been flushed or queried.
563 public int GetItemCount ()
565 if (last_item_count
< 0) {
566 IndexReader reader
= IndexReader
.Open (Store
);
567 last_item_count
= reader
.NumDocs ();
570 return last_item_count
;
574 ///////////////////////////////////////////////////////////////////////////////////////
577 // Code to map to/from Lucene data types
580 private Document
ToLuceneDocument (Indexable indexable
)
584 Document doc
= new Document ();
589 // First we add the Indexable's 'canonical' properties
592 f
= Field
.Keyword ("Uri", UriFu
.UriToSerializableString (indexable
.Uri
));
595 f
= Field
.Keyword ("Type", indexable
.Type
);
598 if (indexable
.MimeType
!= null) {
599 f
= Field
.Keyword ("MimeType", indexable
.MimeType
);
603 if (indexable
.ValidTimestamp
) {
604 str
= StringFu
.DateTimeToString (indexable
.Timestamp
);
605 f
= Field
.Keyword ("Timestamp", str
);
609 if (indexable
.ValidRevision
) {
610 f
= Field
.UnIndexed ("Revision",
611 RevisionToString (indexable
.Revision
));
615 reader
= indexable
.GetTextReader ();
616 if (reader
!= null) {
617 f
= Field
.Text ("Text", reader
);
621 reader
= indexable
.GetHotTextReader ();
622 if (reader
!= null) {
623 f
= Field
.Text ("HotText", reader
);
627 f
= Field
.UnStored ("PropertiesText",
628 indexable
.TextPropertiesAsString
);
631 // FIXME: We shouldn't apply stemming, etc. when dealing
633 f
= Field
.UnStored ("PropertiesKeyword",
634 indexable
.KeywordPropertiesAsString
);
637 // FIXME: We need to deal with duplicate properties in some
639 foreach (Property prop
in indexable
.Properties
) {
640 if (prop
.Value
!= null) {
641 f
= Field
.Keyword (ToLucenePropertyKey (prop
.Key
),
650 static public LNS
.Query
ToUriQuery (ICollection list_of_uris
, UriRemapper remapper
)
652 if (list_of_uris
== null || list_of_uris
.Count
== 0)
655 LNS
.BooleanQuery query
= new LNS
.BooleanQuery ();
656 int max_clauses
= LNS
.BooleanQuery
.GetMaxClauseCount ();
657 int clause_count
= 0;
659 foreach (Uri original_uri
in list_of_uris
) {
660 Uri uri
= original_uri
;
661 if (remapper
!= null)
662 uri
= remapper (uri
);
663 Term term
= new Term ("Uri", uri
.ToString ()); // FIXME: Do we need some UriFu here?
664 LNS
.Query term_query
= new LNS
.TermQuery (term
);
665 query
.Add (term_query
, false, false);
667 // If we have too many clauses, nest the queries
668 if (clause_count
== max_clauses
) {
669 LNS
.BooleanQuery new_query
= new LNS
.BooleanQuery ();
670 new_query
.Add (query
, false, false);
679 static public LNS
.Query
ToUriQuery (ICollection list_of_uris
)
681 return ToUriQuery (list_of_uris
, null);
684 private LNS
.Query
ToCoreLuceneQuery (QueryBody body
, string field
)
686 LNS
.BooleanQuery luceneQuery
= null;
687 foreach (string text_orig
in body
.Text
) {
688 string text
= text_orig
;
690 if (text
== null || text
== "")
693 bool minus_sign
= false;
694 if (text
[0] == '-') {
695 text
= text
.Substring (1);
699 // Use the analyzer to extract the query's tokens.
700 // This code is taken from Lucene's query parser.
701 // We use the standard Analyzer.
702 TokenStream source
= LuceneDriver
.Analyzer
.TokenStream (field
, new StringReader (text
));
703 ArrayList tokens
= new ArrayList ();
706 Lucene
.Net
.Analysis
.Token t
;
709 } catch (IOException
) {
714 tokens
.Add (t
.TermText ());
718 } catch (IOException
) {
723 if (tokens
.Count
== 1) {
724 Term t
= new Term (field
, (string) tokens
[0]);
725 q
= new LNS
.TermQuery (t
);
726 } else if (tokens
.Count
> 1) {
727 q
= new LNS
.PhraseQuery ();
728 foreach (string tokenStr
in tokens
) {
729 Term t
= new Term (field
, tokenStr
);
730 ((LNS
.PhraseQuery
) q
).Add (t
);
735 if (luceneQuery
== null)
736 luceneQuery
= new LNS
.BooleanQuery ();
737 luceneQuery
.Add (q
, !minus_sign
, minus_sign
);
744 // search_subset limits the score of our search to that set of Uris
745 // bonus_uris are always matched by the query
746 private LNS
.Query
ToLuceneQuery (QueryBody body
,
747 ICollection search_subset
,
748 ICollection bonus_uris
)
750 LNS
.BooleanQuery body_query
= null;
751 LNS
.Query search_subset_query
= null;
752 LNS
.Query bonus_uris_query
= null;
753 LNS
.BooleanQuery mime_type_query
= null;
754 LNS
.BooleanQuery hit_type_query
= null;
756 if (body
.Text
.Count
> 0) {
758 body_query
= new LNS
.BooleanQuery ();
762 q
= ToCoreLuceneQuery (body
, "PropertiesText");
765 body_query
.Add (q
, false, false);
768 q
= ToCoreLuceneQuery (body
, "PropertiesKeyword");
771 body_query
.Add (q
, false, false);
774 q
= ToCoreLuceneQuery (body
, "HotText");
777 body_query
.Add (q
, false, false);
780 q
= ToCoreLuceneQuery (body
, "Text");
782 body_query
.Add (q
, false, false);
786 search_subset_query
= ToUriQuery (search_subset
, null);
788 bonus_uris_query
= ToUriQuery (bonus_uris
, null);
790 if (body
.MimeTypes
.Count
> 0) {
791 mime_type_query
= new LNS
.BooleanQuery ();
792 foreach (string mime_type
in body
.MimeTypes
) {
793 Term t
= new Term ("MimeType", mime_type
);
794 LNS
.Query q
= new LNS
.TermQuery (t
);
795 mime_type_query
.Add (q
, false, false);
799 if (body
.HasHitTypes
) {
800 hit_type_query
= new LNS
.BooleanQuery ();
801 foreach (string hit_type
in body
.HitTypes
) {
802 Term t
= new Term ("Type", hit_type
);
803 LNS
.Query q
= new LNS
.TermQuery (t
);
804 hit_type_query
.Add (q
, false, false);
809 // Now we combine the various parts into one big query.
812 LNS
.BooleanQuery total_query
= new LNS
.BooleanQuery ();
814 // If we have hit types or mime types, those must be matched
815 if (mime_type_query
!= null)
816 total_query
.Add (mime_type_query
, true, false);
817 if (hit_type_query
!= null)
818 total_query
.Add (hit_type_query
, true, false);
820 // We also must match the "content query":
821 // (body_query OR bonus_uris_query) AND search_subset_query
823 LNS
.Query content_query
= null;
825 if (body_query
!= null && bonus_uris_query
!= null) {
826 LNS
.BooleanQuery q
= new LNS
.BooleanQuery ();
827 q
.Add (body_query
, false, false);
828 q
.Add (bonus_uris_query
, false, false);
830 } else if (body_query
!= null) {
831 content_query
= body_query
;
832 } else if (bonus_uris_query
!= null) {
833 content_query
= bonus_uris_query
;
836 if (content_query
!= null && search_subset_query
!= null) {
837 LNS
.BooleanQuery q
= new LNS
.BooleanQuery ();
838 q
.Add (content_query
, true, false);
839 q
.Add (search_subset_query
, true, false);
841 } else if (search_subset_query
!= null) {
842 content_query
= search_subset_query
;
845 if (content_query
!= null)
846 total_query
.Add (content_query
, true, false);
851 static private Uri
UriFromLuceneDoc (Document doc
)
853 string uri
= doc
.Get ("Uri");
855 throw new Exception ("Got document from Lucene w/o a URI!");
856 return UriFu
.UriStringToUri (uri
);
859 static private void FromLuceneDocToVersioned (Document doc
, Versioned versioned
)
863 str
= doc
.Get ("Timestamp");
865 versioned
.Timestamp
= StringFu
.StringToDateTime (str
);
867 str
= doc
.Get ("Revision");
869 versioned
.Revision
= StringToRevision (str
);
873 private Hit
FromLuceneDocToHit (Document doc
, int id
, double score
)
875 Hit hit
= new Hit ();
881 FromLuceneDocToVersioned (doc
, hit
);
883 hit
.Uri
= UriFromLuceneDoc (doc
);
885 str
= doc
.Get ("Type");
887 throw new Exception ("Got hit from Lucene w/o a Type!");
890 hit
.MimeType
= doc
.Get ("MimeType");
892 hit
.Source
= "lucene";
893 hit
.ScoreRaw
= score
;
895 foreach (Field ff
in doc
.Fields ()) {
896 string key
= FromLucenePropertyKey (ff
.Name ());
898 hit
[key
] = ff
.StringValue ();
905 /////////////////////////////////////////////////////
908 // A common, shared analyzer
911 private class BeagleNoiseFilter
: TokenFilter
{
913 static int total_count
= 0;
914 static int noise_count
= 0;
916 TokenStream token_stream
;
918 public BeagleNoiseFilter (TokenStream input
) : base (input
)
920 token_stream
= input
;
923 // FIXME: we should add some heuristics that are stricter
924 // but explicitly try to avoid filtering out dates,
925 // phone numbers, etc.
926 private static bool IsNoise (string text
)
928 // Anything really long is almost certainly noise.
929 if (text
.Length
> 30)
932 // Look at how often we switch between numbers and letters.
934 // <letter> <digit> 1
935 // <digit> <letter> 1
936 // <x> <punct>+ <x> 1
937 // <x> <punct>+ <y> 2
938 const int transitions_cutoff
= 4;
939 int last_type
= -1, last_non_punct_type
= -1, first_type
= -1;
940 bool has_letter
= false, has_digit
= false, has_punctuation
= false;
942 for (int i
= 0; i
< text
.Length
&& transitions
< transitions_cutoff
; ++i
) {
945 if (Char
.IsLetter (c
)) {
948 } else if (Char
.IsDigit (c
)) {
951 } else if (Char
.IsPunctuation (c
)) {
953 has_punctuation
= true;
958 if (type
!= last_type
) {
959 if (last_type
== 3) {
960 if (type
!= last_non_punct_type
)
967 if (first_type
== -1)
972 last_non_punct_type
= type
;
976 // If we make too many transitions, it must be noise.
977 if (transitions
>= transitions_cutoff
)
980 // If we consist of nothing but digits and punctuation, treat it
981 // as noise if it is too long.
982 if (transitions
== 1 && first_type
!= 1 && text
.Length
> 10)
985 // We are very suspicious of long things that make lots of
987 if (transitions
> 3 && text
.Length
> 10)
990 // Beware of anything long that contains a little of everything.
991 if (has_letter
&& has_digit
&& has_punctuation
&& text
.Length
> 10)
994 //Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
999 public override Lucene
.Net
.Analysis
.Token
Next ()
1001 Lucene
.Net
.Analysis
.Token token
;
1002 while ( (token
= token_stream
.Next ()) != null) {
1004 if (total_count
> 0 && total_count
% 5000 == 0)
1005 Logger
.Log
.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)",
1006 noise_count
, total_count
, 100.0 * noise_count
/ total_count
);
1009 if (IsNoise (token
.TermText ())) {
1019 // This is just a standard analyzer combined with the Porter stemmer.
1020 // FIXME: This assumes everything being indexed is in English!
1021 private class BeagleAnalyzer
: StandardAnalyzer
{
1022 public override TokenStream
TokenStream (String fieldName
, TextReader reader
)
1024 TokenStream outstream
= base.TokenStream (fieldName
, reader
);
1025 if (fieldName
== "Text" || fieldName
== "HotText")
1026 outstream
= new BeagleNoiseFilter (outstream
);
1027 outstream
= new PorterStemFilter (outstream
);
1032 private static Analyzer theAnalyzer
;
1034 private static Analyzer Analyzer
{
1036 if (theAnalyzer
== null)
1037 theAnalyzer
= new BeagleAnalyzer ();
1042 /////////////////////////////////////////////////////
1045 // Access to the Stemmer
1048 static PorterStemmer stemmer
= new PorterStemmer ();
1050 static public string Stem (string str
)
1052 return stemmer
.Stem (str
);
1055 /////////////////////////////////////////////////////
1058 // Helpful little utility functions
1061 static private String
RevisionToString (long rev
)
1063 return Convert
.ToString (rev
);
1066 static private long StringToRevision (String str
)
1068 return Convert
.ToInt64 (str
);
1071 const string propPrefix
= "prop:";
1073 private string ToLucenePropertyKey (string key
)
1075 return propPrefix
+ key
;
1078 private string FromLucenePropertyKey (string key
)
1080 if (key
.StartsWith (propPrefix
))
1081 return key
.Substring (propPrefix
.Length
);
1085 /////////////////////////////////////////////////////
1087 // Expose some information for debugging and analytical purposes.
1089 public void WriteIndexTermFrequencies (TextWriter writer
)
1091 IndexReader reader
= IndexReader
.Open (Store
);
1092 TermEnum term_enum
= reader
.Terms ();
1095 while (term_enum
.Next ()) {
1096 term
= term_enum
.Term ();
1097 int freq
= term_enum
.DocFreq ();
1098 writer
.WriteLine ("{0} {1} {2}", term
.Field (), freq
, term
.Text ());