Tokenize 001234 as 1234. Include a testing function in NoiseFilter to figure out...
[beagle.git] / beagled / LuceneIndexingDriver.cs
blob6774eac271025d445363b8ecfe893c2c429bab7e
1 //
2 // LuceneIndexingDriver.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This should be the only piece of source code that knows anything
29 // about Lucene's internals.
32 using System;
33 using System.Collections;
34 using System.Diagnostics;
35 using System.Globalization;
36 using System.IO;
37 using System.Text;
38 using System.Threading;
39 using System.Xml;
40 using System.Xml.Serialization;
42 using Lucene.Net.Analysis;
43 using Lucene.Net.Analysis.Standard;
44 using Lucene.Net.Documents;
45 using Lucene.Net.Index;
46 using Lucene.Net.QueryParsers;
47 using LNS = Lucene.Net.Search;
49 using Beagle.Util;
51 namespace Beagle.Daemon {
53 public class LuceneIndexingDriver : LuceneCommon, IIndexer {
// Monitor object taken by Flush around Flush_Unlocked.  It is readonly
// so that the object being locked on can never be replaced.
private readonly object flush_lock = new object ();
// Main constructor; the other overloads all funnel into this one.
//
// index_name and minor_version are handed straight to the base class.
// Afterwards the index is opened if Exists () reports that it is already
// there, and created otherwise.  When build_usercache is true, text_cache
// is set to TextCache.UserCache; when false, text_cache is left untouched.
public LuceneIndexingDriver (string index_name, int minor_version, bool build_usercache)
	: base (index_name, minor_version)
{
	if (! Exists ()) {
		Create ();
	} else {
		Open ();
	}

	if (build_usercache) {
		text_cache = TextCache.UserCache;
	}
}
// Convenience overload: same as the three-argument constructor with
// build_usercache = true.
public LuceneIndexingDriver (string index_name, int minor_version)
	: this (index_name, minor_version, true)
{
}

// Convenience overload: same as the three-argument constructor with
// minor_version = 0.
public LuceneIndexingDriver (string index_name, bool build_usercache)
	: this (index_name, 0, build_usercache)
{
}

// Convenience overload: minor_version = 0 and build_usercache = true.
// Chains through the (string, int) overload, which supplies the 'true'.
public LuceneIndexingDriver (string index_name)
	: this (index_name, 0)
{
}
78 ////////////////////////////////////////////////////////////////
81 // Implementation of the IIndexer interface
// Processes an indexer request and returns the receipts produced by
// Flush_Unlocked.  The whole operation runs while holding flush_lock,
// so only one flush executes at a time on this driver.
//
// The real work lives in Flush_Unlocked purely so that its large body
// does not have to sit inside the lock statement.
public IndexerReceipt [] Flush (IndexerRequest request)
{
	IndexerReceipt [] receipts;

	lock (flush_lock) {
		receipts = Flush_Unlocked (request);
	}

	return receipts;
}
// Does the actual work of Flush.  The only caller visible in this file
// (Flush) invokes it while holding flush_lock.
//
// The request is processed in four steps:
//   1. For every Add or Remove, delete the matching documents (and their
//      direct children) from both indexes; collect a query matching the
//      Uris of all PropertyChange indexables.
//   2. For property changes, load the current secondary documents into a
//      table keyed by Uri and delete them from the secondary index.
//   3. Write out the new documents for every Add and PropertyChange.
//   4. Optionally optimize, close the writers, and return the receipts.
//
// Returns one IndexerRemovedReceipt per Remove, one IndexerAddedReceipt
// per Add or PropertyChange, plus an IndexerChildIndexablesReceipt for
// every filter that produced child indexables.
//
// Readers and writers are closed in finally blocks so that an exception
// part-way through does not leave them (and whatever locks they hold)
// open until they are garbage collected.
private IndexerReceipt [] Flush_Unlocked (IndexerRequest request)
{
	ArrayList receipt_queue = new ArrayList ();

	LNS.BooleanQuery prop_change_query = null;
	Hashtable prop_change_docs = null;
	int delete_count = 0;

	IndexReader primary_reader = null, secondary_reader = null;

	try {
		primary_reader = IndexReader.Open (PrimaryStore);
		secondary_reader = IndexReader.Open (SecondaryStore);

		// Step #1: Make our first pass over the list of
		// indexables that make up our request.  For each add
		// or remove in the request, delete the associated
		// items from the index.  Assemble a query that will
		// be used to find the secondary documents for any
		// property change requests.

		foreach (Indexable indexable in request.Indexables) {

			switch (indexable.Type) {

			case IndexableType.Add:
			case IndexableType.Remove:

				string uri_str;
				uri_str = UriFu.UriToEscapedString (indexable.Uri);

				Logger.Log.Debug ("-{0}", indexable.DisplayUri);

				// Only deletions from the primary index are counted.
				Term term;
				term = new Term ("Uri", uri_str);
				delete_count += primary_reader.Delete (term);
				secondary_reader.Delete (term);

				// When we delete an indexable, also delete any children.
				// FIXME: Shouldn't we also delete any children of children, etc.?
				term = new Term ("ParentUri", uri_str);
				delete_count += primary_reader.Delete (term);
				secondary_reader.Delete (term);

				// If this is a strict removal (and not a deletion that
				// we are doing in anticipation of adding something back),
				// queue up a removed receipt.
				if (indexable.Type == IndexableType.Remove) {
					IndexerRemovedReceipt r;
					r = new IndexerRemovedReceipt (indexable.Uri);
					receipt_queue.Add (r);
				}

				break;

			case IndexableType.PropertyChange:
				if (prop_change_query == null)
					prop_change_query = new LNS.BooleanQuery ();
				prop_change_query.Add (UriQuery ("Uri", indexable.Uri), false, false);
				break;
			}
		}

		if (HaveItemCount)
			AdjustItemCount (-delete_count);
		else
			SetItemCount (primary_reader);

		// Step #2: If we are doing any property changes,
		// we read in the current secondary documents and
		// store them in a hash table for use later.  Then we
		// delete the current secondary documents.
		if (prop_change_query != null) {
			prop_change_docs = UriFu.NewHashtable ();

			LNS.IndexSearcher secondary_searcher;
			secondary_searcher = new LNS.IndexSearcher (secondary_reader);

			LNS.Hits hits;
			hits = secondary_searcher.Search (prop_change_query);

			// The terms are collected first and only deleted once
			// the searcher has been closed.
			ArrayList delete_terms;
			delete_terms = new ArrayList ();

			int N = hits.Length ();
			for (int i = 0; i < N; ++i) {
				Document doc;
				doc = hits.Doc (i);

				string uri_str;
				uri_str = doc.Get ("Uri");

				Uri uri;
				uri = UriFu.EscapedStringToUri (uri_str);
				prop_change_docs [uri] = doc;

				Term term;
				term = new Term ("Uri", uri_str);
				delete_terms.Add (term);
			}

			secondary_searcher.Close ();

			foreach (Term term in delete_terms)
				secondary_reader.Delete (term);
		}
	} finally {
		// We are now done with the readers, so we close them --
		// also when something above threw.
		// NOTE(review): closing a reader presumably also writes out
		// the deletions made so far; confirm against the Lucene.Net
		// version in use.
		try {
			if (primary_reader != null)
				primary_reader.Close ();
		} finally {
			if (secondary_reader != null)
				secondary_reader.Close ();
		}
	}

	// FIXME: If we crash at exactly this point, we are in
	// trouble.  Items will have been dropped from the index
	// without the proper replacements being added.

	// Step #3: Make another pass across our list of indexables
	// and write out any new documents.

	if (text_cache != null)
		text_cache.BeginTransaction ();

	IndexWriter primary_writer = null, secondary_writer = null;

	try {
		try {
			primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);

			foreach (Indexable indexable in request.Indexables) {

				if (indexable.Type == IndexableType.Remove)
					continue;

				IndexerAddedReceipt r;
				r = new IndexerAddedReceipt (indexable.Uri);
				receipt_queue.Add (r);

				if (indexable.Type == IndexableType.PropertyChange) {

					Logger.Log.Debug ("+{0} (props only)", indexable.DisplayUri);
					r.PropertyChangesOnly = true;

					// NOTE(review): doc is null when step #2 found no
					// secondary document for this Uri; RewriteDocument
					// is assumed to cope with that -- confirm.
					Document doc;
					doc = prop_change_docs [indexable.Uri] as Document;

					Document new_doc;
					new_doc = RewriteDocument (doc, indexable);

					// Write out the new document...
					if (secondary_writer == null)
						secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
					secondary_writer.AddDocument (new_doc);

					continue; // ...and proceed to the next Indexable
				}

				// If we reach this point we know we are dealing with an IndexableType.Add

				if (indexable.Type != IndexableType.Add)
					throw new Exception ("When I said it was an IndexableType.Add, I meant it!");

				Logger.Log.Debug ("+{0}", indexable.DisplayUri);

				Filter filter = null;

				// If we have content, try to find a filter
				// which we can use to process the indexable.
				try {
					FilterFactory.FilterIndexable (indexable, text_cache, out filter);
				} catch (Exception e) {
					Logger.Log.Error (e, "Unable to filter {0} (mimetype={1})", indexable.DisplayUri, indexable.MimeType);
					indexable.NoContent = true;
				}

				Document primary_doc = null, secondary_doc = null;

				// Set once the primary document really made it into the index.
				bool primary_added = false;

				try {
					BuildDocuments (indexable, out primary_doc, out secondary_doc);
					primary_writer.AddDocument (primary_doc);
					primary_added = true;
				} catch (Exception ex) {

					// If an exception was thrown, something bad probably happened
					// while we were filtering the content.  Set NoContent to true
					// and try again -- that way it will at least end up in the index,
					// even if we don't manage to extract the fulltext.

					Logger.Log.Debug (ex, "First attempt to index {0} failed", indexable.DisplayUri);

					indexable.NoContent = true;

					try {
						BuildDocuments (indexable, out primary_doc, out secondary_doc);
						primary_writer.AddDocument (primary_doc);
						primary_added = true;
					} catch (Exception ex2) {
						Logger.Log.Debug (ex2, "Second attempt to index {0} failed, giving up...", indexable.DisplayUri);
					}
				}

				if (filter != null) {

					// Force the clean-up of temporary files, just in case.
					filter.Cleanup ();

					r.FilterName = filter.GetType ().ToString ();
					r.FilterVersion = filter.Version;

					// Create a receipt containing any child indexables.
					if (filter.ChildIndexables.Count > 0) {
						IndexerChildIndexablesReceipt cr;
						cr = new IndexerChildIndexablesReceipt (indexable, filter.ChildIndexables);
						receipt_queue.Add (cr);
					}
				}

				// Only write the secondary document and bump the item
				// count if the primary document was actually added;
				// otherwise we would count an item that is not in the
				// index and leave a secondary document without a primary.
				if (primary_added) {
					if (secondary_doc != null) {
						if (secondary_writer == null)
							secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);

						secondary_writer.AddDocument (secondary_doc);
					}

					AdjustItemCount (1);
				}

				// Clean up any temporary files associated with filtering this indexable.
				indexable.Cleanup ();
			}
		} finally {
			// Always end the transaction we began above, so that a
			// failure part-way through does not leave it open.
			if (text_cache != null)
				text_cache.CommitTransaction ();
		}

		if (request.OptimizeIndex) {
			Stopwatch watch = new Stopwatch ();
			Logger.Log.Debug ("Optimizing {0}", IndexName);
			watch.Start ();
			primary_writer.Optimize ();
			if (secondary_writer == null)
				secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
			secondary_writer.Optimize ();
			watch.Stop ();
			Logger.Log.Debug ("{0} optimized in {1}", IndexName, watch);
		}
	} finally {
		// Step #4. Close our writers (even on failure) and return
		// the events to indicate what has happened.
		try {
			if (primary_writer != null)
				primary_writer.Close ();
		} finally {
			if (secondary_writer != null)
				secondary_writer.Close ();
		}
	}

	IndexerReceipt [] receipt_array;
	receipt_array = new IndexerReceipt [receipt_queue.Count];
	receipt_queue.CopyTo (receipt_array);

	return receipt_array;
}
347 ////////////////////////////////////////////////////////////////
// Optimizes the primary index and, if SecondaryStore is non-null, the
// secondary index as well.  Each index is opened with a null analyzer
// and without being re-created (third IndexWriter argument is false).
//
// Every writer is closed in a finally block so that it does not stay
// open if Optimize () throws.
public void OptimizeNow ()
{
	IndexWriter writer;

	writer = new IndexWriter (PrimaryStore, null, false);
	try {
		writer.Optimize ();
	} finally {
		writer.Close ();
	}

	if (SecondaryStore != null) {
		writer = new IndexWriter (SecondaryStore, null, false);
		try {
			writer.Optimize ();
		} finally {
			writer.Close ();
		}
	}
}
// Merges another index into this one: index_to_merge.PrimaryStore is
// added to our primary index, then index_to_merge.SecondaryStore is
// added to our secondary index.
//
// Throws ArgumentNullException if index_to_merge is null.  Each writer
// is closed in a finally block so that it does not stay open if
// AddIndexes () throws.
public void Merge (LuceneCommon index_to_merge)
{
	if (index_to_merge == null)
		throw new ArgumentNullException ("index_to_merge");

	// FIXME: Error recovery.  If the secondary merge fails, the
	// primary index has already been merged.

	// Merge the primary index
	Lucene.Net.Store.Directory[] primary_store = {index_to_merge.PrimaryStore};
	IndexWriter primary_writer;
	primary_writer = new IndexWriter (PrimaryStore, null, false);

	try {
		primary_writer.AddIndexes (primary_store);
	} finally {
		primary_writer.Close ();
	}

	// Merge the secondary index
	Lucene.Net.Store.Directory[] secondary_store = {index_to_merge.SecondaryStore};
	IndexWriter secondary_writer;
	secondary_writer = new IndexWriter (SecondaryStore, null, false);

	try {
		secondary_writer.AddIndexes (secondary_store);
	} finally {
		secondary_writer.Close ();
	}
}