2005-05-14 Gabor Kelemen <kelemeng@gnome.hu>
[beagle.git] / beagled / LuceneDriver.cs
blob1fde8c7ccbb5e6a63d12ca33dd7186316bf3c92f
1 //
2 // LuceneDriver.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This should be the only piece of source code that knows anything
29 // about Lucene's internals.
32 using System;
33 using System.Collections;
34 using System.Diagnostics;
35 using System.Globalization;
36 using System.IO;
37 using System.Text;
38 using System.Threading;
39 using System.Xml;
40 using System.Xml.Serialization;
42 using Lucene.Net.Analysis;
43 using Lucene.Net.Analysis.Standard;
44 using Lucene.Net.Documents;
45 using Lucene.Net.Index;
46 using Lucene.Net.QueryParsers;
47 using LNS = Lucene.Net.Search;
49 using Beagle.Util;
51 namespace Beagle.Daemon {
53 public class LuceneDriver : IIndexer {
55 public delegate bool UriFilter (Uri uri);
56 public delegate Uri UriRemapper (Uri uri);
57 public delegate double RelevancyMultiplier (Hit hit);
59 public event IIndexerChangedHandler ChangedEvent;
61 /////////////////////////////////////////////////////
63 // 1: Original
64 // 2: Changed format of timestamp strings
65 // 3: Schema changed to be more Dashboard-Match-like
66 // 4: Schema changed for files to include _Directory property
67 // 5: Changed analyzer to support stemming. Bumped version # to
68 // force everyone to re-index.
69 // 6: lots of schema changes as part of the general refactoring
70 // 7: incremented to force a re-index after our upgrade to lucene 1.4
71 // (in theory the file formats are compatible, we are seeing 'term
72 // out of order' exceptions in some cases)
73 // 8: another forced re-index, this time because of massive changes
74 // in the file system backend (it would be nice to have per-backend
75 // versioning so that we didn't have to purge all indexes just
76 // because one changed)
// On-disk index format version.  Setup () compares this against the
// "version" file and purges/rebuilds the whole index on mismatch.
77 private const int VERSION = 8;
// Root directory of this index (PathFinder.StorageDir/index_name).
79 private string top_dir;
// Pending operations keyed by Uri: a non-null value is an Indexable
// waiting to be added on the next Flush; a null value marks a
// pending removal.
80 private Hashtable pending_by_uri = UriFu.NewHashtable ();
// Counters mirroring pending_by_uri, exposed via PendingAdds /
// PendingRemovals.
81 private int pending_adds = 0;
82 private int pending_removals = 0;
// Number of Flush cycles since the last Optimize; drives NeedsOptimize.
83 private int cycles_since_last_optimization = 0;
// True while Optimize () is running, to prevent re-entrant optimization.
84 private bool optimizing = false;
// Cached document count for GetItemCount; -1 means "unknown, ask
// the IndexReader".
85 private int last_item_count = -1;
// Open (or create) the on-disk index named index_name.
87 public LuceneDriver (string index_name)
89 Setup (index_name);
// Root directory of this index on disk (set by Setup).
92 public string IndexDirectory {
93 get { return top_dir; }
96 /////////////////////////////////////////////////////
99 // The Lucene Store
// The Lucene FSDirectory backing this index, and the filesystem
// path it lives at.  Both are assigned once, in Setup ().
102 private Lucene.Net.Store.Directory ourStore = null;
103 private string ourStorePath = null;
106 public Lucene.Net.Store.Directory Store {
107 get { return ourStore; }
110 public string StorePath {
111 get { return ourStorePath; }
// Locate or (re)build the on-disk layout for index_name: validate
// the version file, fingerprint file and lock directory, purge and
// recreate the whole tree if anything is wrong, then open the
// Lucene store (creating an empty index when necessary).
117 private void Setup (string index_name)
119 top_dir = Path.Combine (PathFinder.StorageDir, index_name);
121 string versionFile = Path.Combine (top_dir, "version");
122 string fingerprintFile = Path.Combine (top_dir, "fingerprint");
123 string lockDir = Path.Combine (top_dir, "Locks");
124 string indexDir = Path.Combine (top_dir, "Index");
// "segments" is the file Lucene always writes; its presence is our
// cheap test for "an index exists here".
125 string indexTestFile = Path.Combine (indexDir, "segments");
127 bool versionExists = File.Exists (versionFile);
128 bool fingerprintExists = File.Exists (fingerprintFile);
129 bool indexExists = File.Exists (indexTestFile);
131 // Check the index's version number. If it is wrong,
132 // declare the index non-existent.
133 if (versionExists && indexExists) {
134 StreamReader sr = new StreamReader (versionFile);
135 string versionStr = sr.ReadLine ();
136 sr.Close ();
138 if (versionStr != Convert.ToString (VERSION))
139 indexExists = false;
142 // If there is no fingerprint file, declare the index
143 // non-existent.
144 if (indexExists && ! fingerprintExists)
145 indexExists = false;
147 // If the index seems to exist but contains dangling locks,
148 // declare the index non-existent.
149 if (indexExists) {
150 DirectoryInfo lockDirInfo = new DirectoryInfo (lockDir);
151 if (! lockDirInfo.Exists)
152 indexExists = false;
153 else {
// A leftover *.lock file means a previous process died while
// holding the index; rebuilding is safer than trusting it.
154 foreach (FileInfo info in lockDirInfo.GetFiles ()) {
155 if (info.Name.IndexOf (".lock") != -1) {
156 indexExists = false;
157 break;
160 if (! indexExists)
161 log.Debug ("Found dangling locks in {0}", lockDir);
165 if (indexExists) {
166 // Read in the fingerprint
167 StreamReader sr = new StreamReader (fingerprintFile);
168 fingerprint = sr.ReadLine ();
169 sr.Close ();
171 } else {
172 // Purge and rebuild the index's directory
173 // structure.
175 if (Directory.Exists (top_dir)) {
176 log.Debug ("Purging {0}", top_dir);
177 Directory.Delete (top_dir, true);
180 // Create all directories.
181 Directory.CreateDirectory (top_dir);
182 Directory.CreateDirectory (lockDir);
183 Directory.CreateDirectory (indexDir);
185 StreamWriter sw;
187 // Generate a fingerprint and write it out
188 fingerprint = Guid.NewGuid ().ToString ();
189 sw = new StreamWriter (fingerprintFile, false);
190 sw.WriteLine (fingerprint);
191 sw.Close ();
193 // Write out our version information
194 sw = new StreamWriter (versionFile, false);
195 sw.WriteLine ("{0}", VERSION);
196 sw.Close ();
199 Lucene.Net.Store.FSDirectory store;
200 store = Lucene.Net.Store.FSDirectory.GetDirectory (indexDir, lockDir, false);
201 ourStore = store;
202 ourStorePath = indexDir;
204 //Store = store;
206 if (!indexExists) {
207 // This creates the index if it doesn't exist
208 IndexWriter writer = new IndexWriter (Store, null, true);
209 writer.Close ();
213 /////////////////////////////////////////////////////
216 // The log
// Shared logger for all LuceneDriver instances.
219 private static Logger log = Logger.Get ("lucene");
221 private static Logger Log {
222 get { return log; }
225 /////////////////////////////////////////////////////
228 // The Index's Fingerprint and up-to-date checking
// Unique id of this index instance; read from or generated and
// written to the "fingerprint" file in Setup ().
231 private string fingerprint = null;
233 public string Fingerprint {
234 get { return fingerprint; }
237 /////////////////////////////////////////////////////
240 // Public Indexing API
// Reusable empty collection passed to ChangedEvent when there are
// no renamed uris to report.
243 static object [] empty_collection = new object [0];
// Schedule an indexable to be added to the index on the next Flush.
// A pending removal (or an earlier pending add) for the same Uri is
// superseded by this add.
public void Add (Indexable indexable)
{
	Uri uri = indexable.Uri;

	lock (pending_by_uri) {
		if (pending_by_uri.Contains (uri)) {
			// A null value marks a pending removal; a non-null
			// value is an earlier pending add that we replace.
			// Decrement the matching counter so the counters
			// track distinct pending operations, not raw call
			// counts (the old code double-counted repeated Adds
			// of the same Uri).
			if (pending_by_uri [uri] == null)
				--pending_removals;
			else
				--pending_adds;
		}
		pending_by_uri [uri] = indexable;
		++pending_adds;
	}
}
// Schedule the item with the given Uri to be removed from the index
// on the next Flush.  A pending add for the same Uri is cancelled.
public void Remove (Uri uri)
{
	lock (pending_by_uri) {
		if (pending_by_uri.Contains (uri)) {
			// Non-null marks a pending add we are cancelling;
			// null marks a removal that was already pending.
			// Adjust the counters so repeated Removes of the
			// same Uri do not inflate pending_removals (the old
			// code double-counted them).
			if (pending_by_uri [uri] != null)
				--pending_adds;
			else
				--pending_removals;
		}
		pending_by_uri [uri] = null;
		++pending_removals;
	}
}
// Part of the IIndexer interface, but not supported by this driver:
// it only logs the request.  Callers must remove + re-add instead.
267 public void Rename (Uri old_uri, Uri new_uri)
269 Logger.Log.Error ("**** LuceneDriver does not support Rename!");
270 Logger.Log.Error ("**** old_uri={0}", old_uri);
271 Logger.Log.Error ("**** new_uri={0}", new_uri);
// Number of adds / removals queued since the last Flush.
274 public int PendingAdds {
275 get { return pending_adds; }
278 public int PendingRemovals {
279 get { return pending_removals; }
// Apply all pending adds and removals to the on-disk index:
// (1) delete every pending Uri, (2) write the pending adds,
// (3) fire ChangedEvent, then optimize if we are due.  No-op when
// nothing is pending.
282 public void Flush ()
284 ArrayList pending_uris;
285 ArrayList pending_indexables;
287 ArrayList added_uris;
288 ArrayList removed_uris;
290 lock (pending_by_uri) {
292 if (pending_by_uri.Count == 0)
293 return;
295 pending_uris = new ArrayList ();
296 pending_indexables = new ArrayList ();
297 added_uris = new ArrayList ();
298 removed_uris = new ArrayList ();
300 // Move our indexables and remove requests out of the
301 // hash and into local data structures.
302 foreach (DictionaryEntry entry in pending_by_uri) {
303 Uri uri = (Uri) entry.Key;
304 Indexable indexable = (Indexable) entry.Value;
306 pending_uris.Add (uri);
307 if (indexable != null)
308 pending_indexables.Add (indexable);
310 if (indexable != null)
311 added_uris.Add (uri);
312 else
313 removed_uris.Add (uri);
316 pending_adds = 0;
317 pending_removals = 0;
318 pending_by_uri.Clear ();
321 int add_count = 0;
322 int removal_count = 0;
324 Log.Debug ("Flushing...");
326 Stopwatch watch = new Stopwatch ();
328 // Step #1: Delete all items with the same URIs
329 // as our pending items from the index.
// Deleting first makes an "add" of an existing Uri an update.
330 watch.Restart ();
331 IndexReader reader = IndexReader.Open (Store);
332 foreach (Uri uri in pending_uris) {
333 log.Debug ("- {0}", uri);
334 Term term = new Term ("Uri", uri.ToString ());
335 reader.Delete (term);
336 ++removal_count;
338 last_item_count = reader.NumDocs ();
339 reader.Close ();
340 watch.Stop ();
// NOTE(review): if ElapsedTime is 0 this logs Infinity (double
// division), which is harmless but looks odd — confirm Stopwatch
// resolution is fine here.
341 Log.Debug ("Lucene Delete: {0} {1} {2}", watch, pending_uris.Count,
342 pending_uris.Count / watch.ElapsedTime);
345 // Step #2: Write out the pending adds
// The IndexWriter is created lazily so we never open one (and dirty
// the index) when every conversion fails.
346 watch.Restart ();
347 IndexWriter writer = null;
348 foreach (Indexable indexable in pending_indexables) {
350 Log.Debug ("+ {0}", indexable.DisplayUri);
352 Document doc = null;
353 try {
354 doc = ToLuceneDocument (indexable);
355 } catch (Exception e) {
// Best-effort: a single bad indexable is logged and skipped so it
// cannot abort the whole flush.
356 Log.Error ("Unable to convert {0} (type={1}) to a lucene document",
357 indexable.Uri, indexable.Type);
358 Log.Error (e);
361 if (doc != null) {
362 if (writer == null)
363 writer = new IndexWriter (Store, Analyzer, false);
364 writer.AddDocument (doc);
365 ++last_item_count;
366 ++add_count;
369 if (writer != null)
370 writer.Close ();
371 watch.Stop ();
372 Log.Debug ("Lucene Add: {0} {1} {2}", watch, pending_indexables.Count,
373 pending_indexables.Count / watch.ElapsedTime);
376 // Step #3: Fire off an event telling what we just did.
377 if (ChangedEvent != null) {
378 ChangedEvent (this, added_uris, removed_uris, empty_collection);
381 lock (pending_by_uri)
382 cycles_since_last_optimization++;
384 if (NeedsOptimize)
385 Optimize ();
// True once enough Flush cycles have accumulated to justify a
// Lucene segment merge.
388 private bool NeedsOptimize {
389 get {
390 // FIXME: 19 is a totally arbitrary number.
391 return cycles_since_last_optimization > 19;
// Merge the index down to a single segment via IndexWriter.Optimize.
395 private void Optimize ()
397 // If nothing has happened since our last optimization,
398 // do nothing.
399 // If this index is already being optimized, don't
400 // optimize it again.
401 lock (pending_by_uri) {
402 if (optimizing || cycles_since_last_optimization == 0)
403 return;
404 optimizing = true;
407 Log.Debug ("Optimizing {0}...", StorePath);
409 Stopwatch watch = new Stopwatch ();
410 watch.Start ();
// The actual merge runs outside the lock; only the flags are guarded.
412 IndexWriter writer = new IndexWriter (Store, null, false);
413 writer.Optimize ();
414 writer.Close ();
416 watch.Stop ();
418 Log.Debug ("Optimization time for {0}: {1}", StorePath, watch);
420 lock (pending_by_uri) {
421 optimizing = false;
422 cycles_since_last_optimization = 0;
426 /////////////////////////////////////////////////////
// Run a query against this index and stream matching Hits into
// result, stopping early once result starts rejecting scores.
//
// search_subset limits matches to that set of (internal) uris;
// bonus_uris are always matched; uri_filter can veto individual
// hits; uri_remapper maps internal uris to external ones;
// relevancy_multiplier lets the caller scale each hit's score.
public void DoQuery (QueryBody body,
		     IQueryResult result,
		     ICollection search_subset, // should be internal uris
		     ICollection bonus_uris,    // should be internal uris
		     UriFilter uri_filter,
		     UriRemapper uri_remapper,  // map to external uris
		     RelevancyMultiplier relevancy_multiplier)
{
	double t_lucene;
	double t_assembly;

	Stopwatch sw = new Stopwatch ();
	sw.Start ();

	IndexReader reader = IndexReader.Open (Store);
	LNS.Searcher searcher = new LNS.IndexSearcher (reader);
	LNS.Query query = ToLuceneQuery (body, search_subset, bonus_uris);

	LNS.Hits hits = searcher.Search (query);
	sw.Stop ();

	t_lucene = sw.ElapsedTime;

	//////////////////////////////////////

	sw.Reset ();
	sw.Start ();

	int n_hits = hits.Length ();
	if (n_hits == 0) {
		// BUGFIX: the old early return leaked the searcher and
		// the IndexReader it wraps; close them before bailing.
		searcher.Close ();
		return;
	}

	for (int i = 0; i < n_hits; ++i) {
		Document doc = hits.Doc (i);

		if (uri_filter != null) {
			Uri uri = UriFromLuceneDoc (doc);
			if (! uri_filter (uri))
				continue;
		}

		double score = (double) hits.Score (i);

		// Hits come back in descending score order, so once one
		// is rejected all the rest would be too.
		if (result.WillReject (score)) {
			log.Debug ("Terminating DoQuery at {0} of {1} (score={2})", i, n_hits, score);
			break;
		}

		Hit hit = FromLuceneDocToHit (doc, hits.Id (i), score);
		if (uri_remapper != null)
			hit.Uri = uri_remapper (hit.Uri);

		if (relevancy_multiplier != null) {
			double m = relevancy_multiplier (hit);
			hit.ScoreMultiplier = (float) m;
		}

		result.Add (hit);
	}

	sw.Stop ();

	t_assembly = sw.ElapsedTime;

	//////////////////////////////////////

	// The call to searcher.Close () also closes the IndexReader.
	searcher.Close ();

	log.Debug ("{0}: n_hits={1} lucene={2:0.00}s assembly={3:0.00}s",
		   StorePath, n_hits, t_lucene, t_assembly);
}
502 // FIXME: This should support Uri filtering, Uri remapping, etc.
// Look up the documents for an explicit list of uris and return the
// matching Hits.  Queries are issued in batches so we never exceed
// Lucene's BooleanQuery clause limit.
503 public ICollection DoQueryByUri (ICollection list_of_uris)
505 LNS.BooleanQuery uri_query = new LNS.BooleanQuery ();
506 LNS.Searcher searcher;
507 LNS.Hits lucene_hits;
508 ArrayList all_hits = new ArrayList ();
510 int max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
511 int clause_count = 0;
513 foreach (Uri uri in list_of_uris) {
514 Term term = new Term ("Uri", uri.ToString ());
515 LNS.Query term_query = new LNS.TermQuery (term);
516 uri_query.Add (term_query, false, false);
517 ++clause_count;
// Clause limit reached: run this batch now and start a fresh query.
519 if (clause_count == max_clauses) {
520 searcher = new LNS.IndexSearcher (Store);
521 lucene_hits = searcher.Search (uri_query);
522 int n_hits = lucene_hits.Length ();
524 for (int i = 0; i < n_hits; ++i) {
525 Hit hit = FromLuceneDocToHit (lucene_hits.Doc (i),
526 lucene_hits.Id (i),
527 lucene_hits.Score (i));
528 all_hits.Add (hit);
531 searcher.Close ();
532 uri_query = new LNS.BooleanQuery ();
533 clause_count = 0;
// Run whatever partial batch is left over.
537 if (clause_count > 0) {
538 searcher = new LNS.IndexSearcher (Store);
539 lucene_hits = searcher.Search (uri_query);
540 int n_hits = lucene_hits.Length ();
542 for (int i = 0; i < n_hits; ++i) {
543 Hit hit = FromLuceneDocToHit (lucene_hits.Doc (i),
544 lucene_hits.Id (i),
545 lucene_hits.Score (i));
546 all_hits.Add (hit);
549 searcher.Close ();
552 return all_hits;
// Convenience overload: look up the document for a single uri.
public ICollection DoQueryByUri (Uri uri)
{
	Uri[] single = new Uri [] { uri };
	return DoQueryByUri (single);
}
560 // We cache the number of documents in the index when readers are
561 // available, so calls to GetItemCount will return immediately
562 // if the driver has been flushed or queried.
// Returns the number of documents currently in the index, opening a
// short-lived IndexReader only when the cached count is stale (-1).
563 public int GetItemCount ()
565 if (last_item_count < 0) {
566 IndexReader reader = IndexReader.Open (Store);
567 last_item_count = reader.NumDocs ();
568 reader.Close ();
570 return last_item_count;
574 ///////////////////////////////////////////////////////////////////////////////////////
577 // Code to map to/from Lucene data types
// Convert an Indexable into the Lucene Document we store: the
// canonical fields (Uri, Type, MimeType, Timestamp, Revision), the
// body text streams, and one prefixed field per custom property.
// Calls indexable.Build () first, which may do real work.
580 private Document ToLuceneDocument (Indexable indexable)
582 indexable.Build ();
584 Document doc = new Document ();
585 Field f;
586 String str;
587 TextReader reader;
589 // First we add the Indexable's 'canonical' properties
590 // to the Document.
// Keyword fields are stored and indexed but not analyzed, so they
// can be matched exactly (e.g. by the delete-by-Uri in Flush).
592 f = Field.Keyword ("Uri", UriFu.UriToSerializableString (indexable.Uri));
593 doc.Add (f);
595 f = Field.Keyword ("Type", indexable.Type);
596 doc.Add (f);
598 if (indexable.MimeType != null) {
599 f = Field.Keyword ("MimeType", indexable.MimeType);
600 doc.Add (f);
603 if (indexable.ValidTimestamp) {
604 str = StringFu.DateTimeToString (indexable.Timestamp);
605 f = Field.Keyword ("Timestamp", str);
606 doc.Add (f);
609 if (indexable.ValidRevision) {
// UnIndexed: stored for retrieval only, never searched.
610 f = Field.UnIndexed ("Revision",
611 RevisionToString (indexable.Revision));
612 doc.Add (f);
// Body text and "hot" (emphasized) text are analyzed and indexed
// from streams, not stored.
615 reader = indexable.GetTextReader ();
616 if (reader != null) {
617 f = Field.Text ("Text", reader);
618 doc.Add (f);
621 reader = indexable.GetHotTextReader ();
622 if (reader != null) {
623 f = Field.Text ("HotText", reader);
624 doc.Add (f);
627 f = Field.UnStored ("PropertiesText",
628 indexable.TextPropertiesAsString);
629 doc.Add (f);
631 // FIXME: We shouldn't apply stemming, etc. when dealing
632 // with this field.
633 f = Field.UnStored ("PropertiesKeyword",
634 indexable.KeywordPropertiesAsString);
635 doc.Add (f);
637 // FIXME: We need to deal with duplicate properties in some
638 // sort of sane way.
639 foreach (Property prop in indexable.Properties) {
640 if (prop.Value != null) {
641 f = Field.Keyword (ToLucenePropertyKey (prop.Key),
642 prop.Value);
643 doc.Add (f);
647 return doc;
// Build a Lucene query matching any of the given uris (optionally
// remapped first).  Returns null for a null or empty collection.
// When the clause limit is hit, the query so far is nested inside a
// fresh BooleanQuery so we never exceed GetMaxClauseCount.
650 static public LNS.Query ToUriQuery (ICollection list_of_uris, UriRemapper remapper)
652 if (list_of_uris == null || list_of_uris.Count == 0)
653 return null;
655 LNS.BooleanQuery query = new LNS.BooleanQuery ();
656 int max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
657 int clause_count = 0;
659 foreach (Uri original_uri in list_of_uris) {
660 Uri uri = original_uri;
661 if (remapper != null)
662 uri = remapper (uri);
663 Term term = new Term ("Uri", uri.ToString ()); // FIXME: Do we need some UriFu here?
664 LNS.Query term_query = new LNS.TermQuery (term);
665 query.Add (term_query, false, false);
666 ++clause_count;
667 // If we have too many clauses, nest the queries
668 if (clause_count == max_clauses) {
669 LNS.BooleanQuery new_query = new LNS.BooleanQuery ();
670 new_query.Add (query, false, false);
671 query = new_query;
// The nested query counts as one clause of the new query.
672 clause_count = 1;
676 return query;
// Overload without uri remapping.
679 static public LNS.Query ToUriQuery (ICollection list_of_uris)
681 return ToUriQuery (list_of_uris, null);
// Build the query for body.Text against a single field: each text
// part is tokenized with our Analyzer; one token becomes a
// TermQuery, several become a PhraseQuery.  A leading '-' makes the
// part prohibited.  Returns null if nothing usable was found.
684 private LNS.Query ToCoreLuceneQuery (QueryBody body, string field)
686 LNS.BooleanQuery luceneQuery = null;
687 foreach (string text_orig in body.Text) {
688 string text = text_orig;
690 if (text == null || text == "")
691 continue;
693 bool minus_sign = false;
694 if (text [0] == '-') {
695 text = text.Substring (1);
696 minus_sign = true;
699 // Use the analyzer to extract the query's tokens.
700 // This code is taken from Lucene's query parser.
701 // We use the standard Analyzer.
702 TokenStream source = LuceneDriver.Analyzer.TokenStream (field, new StringReader (text));
703 ArrayList tokens = new ArrayList ();
705 while (true) {
706 Lucene.Net.Analysis.Token t;
707 try {
708 t = source.Next ();
709 } catch (IOException) {
710 t = null;
712 if (t == null)
713 break;
714 tokens.Add (t.TermText ());
716 try {
717 source.Close ();
718 } catch (IOException) {
719 // ignore
722 LNS.Query q = null;
723 if (tokens.Count == 1) {
724 Term t = new Term (field, (string) tokens [0]);
725 q = new LNS.TermQuery (t);
726 } else if (tokens.Count > 1) {
// Multiple tokens from one text part are treated as a phrase.
727 q = new LNS.PhraseQuery ();
728 foreach (string tokenStr in tokens) {
729 Term t = new Term (field, tokenStr);
730 ((LNS.PhraseQuery) q).Add (t);
734 if (q != null) {
735 if (luceneQuery == null)
736 luceneQuery = new LNS.BooleanQuery ();
// required unless prefixed with '-', prohibited when it was.
737 luceneQuery.Add (q, !minus_sign, minus_sign);
740 return luceneQuery;
744 // search_subset limits the score of our search to that set of Uris
745 // bonus_uris are always matched by the query
// Assemble the full Lucene query for a QueryBody:
//   (mime types) AND (hit types) AND
//   ((body OR bonus_uris) AND search_subset)
// Each piece is only present when the QueryBody supplies it.
746 private LNS.Query ToLuceneQuery (QueryBody body,
747 ICollection search_subset,
748 ICollection bonus_uris)
750 LNS.BooleanQuery body_query = null;
751 LNS.Query search_subset_query = null;
752 LNS.Query bonus_uris_query = null;
753 LNS.BooleanQuery mime_type_query = null;
754 LNS.BooleanQuery hit_type_query = null;
756 if (body.Text.Count > 0) {
758 body_query = new LNS.BooleanQuery ();
760 LNS.Query q;
// Boosts: property matches count more than hot text, which counts
// more than plain body text.
762 q = ToCoreLuceneQuery (body, "PropertiesText");
763 if (q != null) {
764 q.SetBoost (2.5f);
765 body_query.Add (q, false, false);
768 q = ToCoreLuceneQuery (body, "PropertiesKeyword");
769 if (q != null) {
770 q.SetBoost (2.5f);
771 body_query.Add (q, false, false);
774 q = ToCoreLuceneQuery (body, "HotText");
775 if (q != null) {
776 q.SetBoost (1.75f);
777 body_query.Add (q, false, false);
780 q = ToCoreLuceneQuery (body, "Text");
781 if (q != null) {
782 body_query.Add (q, false, false);
786 search_subset_query = ToUriQuery (search_subset, null);
788 bonus_uris_query = ToUriQuery (bonus_uris, null);
790 if (body.MimeTypes.Count > 0) {
791 mime_type_query = new LNS.BooleanQuery ();
792 foreach (string mime_type in body.MimeTypes) {
793 Term t = new Term ("MimeType", mime_type);
794 LNS.Query q = new LNS.TermQuery (t);
795 mime_type_query.Add (q, false, false);
799 if (body.HasHitTypes) {
800 hit_type_query = new LNS.BooleanQuery ();
801 foreach (string hit_type in body.HitTypes) {
802 Term t = new Term ("Type", hit_type);
803 LNS.Query q = new LNS.TermQuery (t);
804 hit_type_query.Add (q, false, false);
809 // Now we combine the various parts into one big query.
812 LNS.BooleanQuery total_query = new LNS.BooleanQuery ();
814 // If we have hit types or mime types, those must be matched
815 if (mime_type_query != null)
816 total_query.Add (mime_type_query, true, false);
817 if (hit_type_query != null)
818 total_query.Add (hit_type_query, true, false);
820 // We also must match the "content query":
821 // (body_query OR bonus_uris_query) AND search_subset_query
823 LNS.Query content_query = null;
825 if (body_query != null && bonus_uris_query != null) {
826 LNS.BooleanQuery q = new LNS.BooleanQuery ();
827 q.Add (body_query, false, false);
828 q.Add (bonus_uris_query, false, false);
829 content_query = q;
830 } else if (body_query != null) {
831 content_query = body_query;
832 } else if (bonus_uris_query != null) {
833 content_query = bonus_uris_query;
836 if (content_query != null && search_subset_query != null) {
837 LNS.BooleanQuery q = new LNS.BooleanQuery ();
838 q.Add (content_query, true, false);
839 q.Add (search_subset_query, true, false);
840 content_query = q;
841 } else if (search_subset_query != null) {
842 content_query = search_subset_query;
845 if (content_query != null)
846 total_query.Add (content_query, true, false);
848 return total_query;
// Pull the "Uri" field out of a Lucene document and deserialize it.
// Every document we write carries a Uri (see ToLuceneDocument), so
// a missing one means the index is corrupt or foreign.
static private Uri UriFromLuceneDoc (Document doc)
{
	string uriString = doc.Get ("Uri");
	if (uriString != null)
		return UriFu.UriStringToUri (uriString);
	throw new Exception ("Got document from Lucene w/o a URI!");
}
// Copy the Timestamp and Revision fields (when present) out of a
// Lucene document into a Versioned object.
859 static private void FromLuceneDocToVersioned (Document doc, Versioned versioned)
861 string str;
863 str = doc.Get ("Timestamp");
864 if (str != null)
865 versioned.Timestamp = StringFu.StringToDateTime (str);
867 str = doc.Get ("Revision");
868 if (str != null)
869 versioned.Revision = StringToRevision (str);
// Build a Hit from a stored Lucene document: canonical fields
// first, then every "prop:"-prefixed field as a Hit property.
// Throws if the document lacks a Uri or a Type.
873 private Hit FromLuceneDocToHit (Document doc, int id, double score)
875 Hit hit = new Hit ();
877 hit.Id = id;
879 string str;
881 FromLuceneDocToVersioned (doc, hit);
883 hit.Uri = UriFromLuceneDoc (doc);
885 str = doc.Get ("Type");
886 if (str == null)
887 throw new Exception ("Got hit from Lucene w/o a Type!");
888 hit.Type = str;
890 hit.MimeType = doc.Get ("MimeType");
892 hit.Source = "lucene";
893 hit.ScoreRaw = score;
// Non-property fields yield a null key and are skipped.
895 foreach (Field ff in doc.Fields ()) {
896 string key = FromLucenePropertyKey (ff.Name ());
897 if (key != null)
898 hit [key] = ff.StringValue ();
901 return hit;
905 /////////////////////////////////////////////////////
908 // A common, shared analyzer
// TokenFilter that drops "noise" tokens (long gibberish, serial
// numbers, etc.) from the analyzed stream before they are indexed.
911 private class BeagleNoiseFilter : TokenFilter {
// Running totals across all instances, used by the (disabled)
// periodic debug log in Next ().
913 static int total_count = 0;
914 static int noise_count = 0;
916 TokenStream token_stream;
918 public BeagleNoiseFilter (TokenStream input) : base (input)
920 token_stream = input;
923 // FIXME: we should add some heuristics that are stricter
924 // but explicitly try to avoid filtering out dates,
925 // phone numbers, etc.
// Heuristic: classify a token as noise based on its length and on
// how often it switches between letters, digits and punctuation.
926 private static bool IsNoise (string text)
928 // Anything really long is almost certainly noise.
929 if (text.Length > 30)
930 return true;
932 // Look at how often we switch between numbers and letters.
933 // Scoring:
934 // <letter> <digit> 1
935 // <digit> <letter> 1
936 // <x> <punct>+ <x> 1
937 // <x> <punct>+ <y> 2
938 const int transitions_cutoff = 4;
939 int last_type = -1, last_non_punct_type = -1, first_type = -1;
940 bool has_letter = false, has_digit = false, has_punctuation = false;
941 int transitions = 0;
// Types: 1 = letter, 2 = digit, 3 = punctuation, -1 = other.
942 for (int i = 0; i < text.Length && transitions < transitions_cutoff; ++i) {
943 char c = text [i];
944 int type = -1;
945 if (Char.IsLetter (c)) {
946 type = 1;
947 has_letter = true;
948 } else if (Char.IsDigit (c)) {
949 type = 2;
950 has_digit = true;
951 } else if (Char.IsPunctuation (c)) {
952 type = 3;
953 has_punctuation = true;
956 if (type != -1) {
958 if (type != last_type) {
// A punctuation run scores 1 if it returns to the same type,
// 2 if it lands on a different one (counted on both edges).
959 if (last_type == 3) {
960 if (type != last_non_punct_type)
961 ++transitions;
962 } else {
963 ++transitions;
967 if (first_type == -1)
968 first_type = type;
970 last_type = type;
971 if (type != 3)
972 last_non_punct_type = type;
976 // If we make too many transitions, it must be noise.
977 if (transitions >= transitions_cutoff)
978 return true;
980 // If we consist of nothing but digits and punctuation, treat it
981 // as noise if it is too long.
982 if (transitions == 1 && first_type != 1 && text.Length > 10)
983 return true;
985 // We are very suspicious of long things that make lots of
986 // transitions
987 if (transitions > 3 && text.Length > 10)
988 return true;
990 // Beware of anything long that contains a little of everything.
991 if (has_letter && has_digit && has_punctuation && text.Length > 10)
992 return true;
994 //Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
995 return false;
// Return the next non-noise token from the wrapped stream, or null
// at end of stream.
999 public override Lucene.Net.Analysis.Token Next ()
1001 Lucene.Net.Analysis.Token token;
1002 while ( (token = token_stream.Next ()) != null) {
1003 #if false
1004 if (total_count > 0 && total_count % 5000 == 0)
1005 Logger.Log.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)",
1006 noise_count, total_count, 100.0 * noise_count / total_count);
1007 #endif
1008 ++total_count;
1009 if (IsNoise (token.TermText ())) {
1010 ++noise_count;
1011 continue;
1013 return token;
1015 return null;
1019 // This is just a standard analyzer combined with the Porter stemmer.
1020 // FIXME: This assumes everything being indexed is in English!
1021 private class BeagleAnalyzer : StandardAnalyzer {
// Body-text fields additionally pass through the noise filter;
// every field gets Porter stemming on top of the standard chain.
1022 public override TokenStream TokenStream (String fieldName, TextReader reader)
1024 TokenStream outstream = base.TokenStream (fieldName, reader);
1025 if (fieldName == "Text" || fieldName == "HotText")
1026 outstream = new BeagleNoiseFilter (outstream);
1027 outstream = new PorterStemFilter (outstream);
1028 return outstream;
// Lazily-created shared analyzer used for both indexing and query
// parsing (they must tokenize identically).
// NOTE(review): the lazy init is not locked — looks like it relies
// on single-threaded first use; confirm, or worst case two
// analyzers get created and one is discarded.
1032 private static Analyzer theAnalyzer;
1034 private static Analyzer Analyzer {
1035 get {
1036 if (theAnalyzer == null)
1037 theAnalyzer = new BeagleAnalyzer ();
1038 return theAnalyzer;
1042 /////////////////////////////////////////////////////
1045 // Access to the Stemmer
// Shared Porter stemmer, exposed so other code can stem terms the
// same way the index does.
1048 static PorterStemmer stemmer = new PorterStemmer ();
1050 static public string Stem (string str)
1052 return stemmer.Stem (str);
1055 /////////////////////////////////////////////////////
1058 // Helpful little utility functions
// Render a revision number for storage in the index.  Uses the
// invariant culture so the stored form never depends on the user's
// locale and always round-trips through StringToRevision.
static private String RevisionToString (long rev)
{
	return rev.ToString (CultureInfo.InvariantCulture);
}
// Parse a revision string written by RevisionToString.  Uses the
// invariant culture to match the writer.  A null string yields 0
// (Convert.ToInt64 semantics), which covers a missing field.
static private long StringToRevision (String str)
{
	return Convert.ToInt64 (str, CultureInfo.InvariantCulture);
}
// Indexable properties are stored in the Lucene document under this
// prefix, keeping them clear of our canonical field names.
const string propPrefix = "prop:";

// Map an Indexable property name to its Lucene field name.
// Made static: it uses no instance state, matching the other
// static helpers in this file.
static private string ToLucenePropertyKey (string key)
{
	return propPrefix + key;
}
// Inverse of ToLucenePropertyKey: strip the property prefix from a
// Lucene field name, or return null when the field is not a stored
// property.  Ordinal comparison: field names are machine keys, and
// the culture-sensitive StartsWith overload could mis-match them
// under some locales.  Made static: it uses no instance state.
static private string FromLucenePropertyKey (string key)
{
	if (key.StartsWith (propPrefix, StringComparison.Ordinal))
		return key.Substring (propPrefix.Length);
	return null;
}
1085 /////////////////////////////////////////////////////
1087 // Expose some information for debugging and analytical purposes.
// Debug/analysis helper: dump every term in the index to writer as
// "<field> <docfreq> <text>", one term per line.
1089 public void WriteIndexTermFrequencies (TextWriter writer)
1091 IndexReader reader = IndexReader.Open (Store);
1092 TermEnum term_enum = reader.Terms ();
1094 Term term;
1095 while (term_enum.Next ()) {
1096 term = term_enum.Term ();
1097 int freq = term_enum.DocFreq ();
1098 writer.WriteLine ("{0} {1} {2}", term.Field (), freq, term.Text ());
1101 reader.Close ();