cvsimport
[beagle.git] / beagled / LuceneIndexingDriver.cs
blob2d55e299f3618b696ae49a90795fabdebafaf1c1
1 //
2 // LuceneIndexingDriver.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This should be the only piece of source code that knows anything
29 // about Lucene's internals.
32 using System;
33 using System.Collections;
34 using System.Diagnostics;
35 using System.Globalization;
36 using System.IO;
37 using System.Text;
38 using System.Threading;
39 using System.Xml;
40 using System.Xml.Serialization;
42 using Lucene.Net.Analysis;
43 using Lucene.Net.Analysis.Standard;
44 using Lucene.Net.Documents;
45 using Lucene.Net.Index;
46 using Lucene.Net.QueryParsers;
47 using LNS = Lucene.Net.Search;
49 using Beagle.Util;
51 namespace Beagle.Daemon {
53 public class LuceneIndexingDriver : LuceneCommon, IIndexer {
// Held by Flush for the duration of a flush, so that flushes on this
// driver never overlap.
private object flush_lock = new object ();

// Builds a driver for the named index.  The index is opened when
// Exists () reports one and created otherwise.  When build_usercache
// is true, the shared TextCache.UserCache is used as this driver's
// text cache.
public LuceneIndexingDriver (string index_name, int minor_version, bool build_usercache)
	: base (index_name, minor_version)
{
	if (! Exists ()) {
		Create ();
	} else {
		Open ();
	}

	if (build_usercache) {
		text_cache = TextCache.UserCache;
	}
}
// Convenience overload: explicit minor version, user text cache enabled.
public LuceneIndexingDriver (string index_name, int minor_version)
	: this (index_name, minor_version, true)
{
}

// Convenience overload: minor version 0, caller decides about the
// user text cache.
public LuceneIndexingDriver (string index_name, bool build_usercache)
	: this (index_name, 0, build_usercache)
{
}

// Convenience overload: minor version 0, user text cache enabled.
public LuceneIndexingDriver (string index_name)
	: this (index_name, 0, true)
{
}
78 ////////////////////////////////////////////////////////////////
// Progress hook for the index helper: it lets the helper report which
// file is being filtered if the helper ends up spinning the CPU.
// While an indexable is being added the notifier is invoked with the
// display Uri and a null filter (filter not chosen yet), then again
// with the filter that was picked, and finally with two null
// arguments once filtering of that indexable has finished.
public delegate void FileFilterDelegate (Uri display_uri, Filter filter);

// Optional callback; left unset (null) unless a consumer installs one.
public FileFilterDelegate FileFilterNotifier;
87 ////////////////////////////////////////////////////////////////
90 // Implementation of the IIndexer interface
// IIndexer entry point.  Serializes flushes on flush_lock and hands
// the real work to Flush_Unlocked, which is kept as a separate method
// purely to avoid one more level of indentation.
public IndexerReceipt [] Flush (IndexerRequest request)
{
	IndexerReceipt [] receipts;

	lock (flush_lock) {
		receipts = Flush_Unlocked (request);
	}

	return receipts;
}
// Applies every indexable in the request to the primary and secondary
// indexes and returns one receipt per observable outcome.  The caller
// (Flush) must hold flush_lock.
//
// The work is done in four steps:
//   1. Delete the existing documents for every Add/Remove and build
//      the queries needed for PropertyChange requests.
//   2. For property changes, load and delete the current secondary
//      documents and collect the Uris of their children.
//   3. Write out the new documents (filtering content for Adds).
//   4. Optionally optimize, close the writers and return the receipts.
//
// Receipts produced: IndexerRemovedReceipt for each Remove,
// IndexerAddedReceipt for each Add and PropertyChange, and
// IndexerChildIndexablesReceipt whenever child indexables were
// generated by a filter or need a propagated property change.
private IndexerReceipt [] Flush_Unlocked (IndexerRequest request)
{
	ArrayList receipt_queue;
	receipt_queue = new ArrayList ();

	IndexReader primary_reader, secondary_reader;
	primary_reader = IndexReader.Open (PrimaryStore);
	secondary_reader = IndexReader.Open (SecondaryStore);

	// Step #1: Make our first pass over the list of
	// indexables that make up our request.  For each add
	// or remove in the request, delete the associated
	// items from the index.  Assemble a query that will
	// be used to find the secondary documents for any
	// property change requests.

	LNS.BooleanQuery prop_change_query = null;
	LNS.BooleanQuery prop_change_children_query = null;
	int delete_count = 0;

	ICollection request_indexables = request.Indexables;

	foreach (Indexable indexable in request_indexables) {

		switch (indexable.Type) {

		case IndexableType.Add:
		case IndexableType.Remove:

			string uri_str;
			uri_str = UriFu.UriToEscapedString (indexable.Uri);

			Logger.Log.Debug ("-{0}", indexable.DisplayUri);

			// Only deletions from the primary index are counted;
			// the count feeds the item-count adjustment below.
			Term term;
			term = new Term ("Uri", uri_str);
			delete_count += primary_reader.Delete (term);
			secondary_reader.Delete (term);

			// When we delete an indexable, also delete any children.
			// FIXME: Shouldn't we also delete any children of children, etc.?
			term = new Term ("ParentUri", uri_str);
			delete_count += primary_reader.Delete (term);
			secondary_reader.Delete (term);

			// If this is a strict removal (and not a deletion that
			// we are doing in anticipation of adding something back),
			// queue up a removed receipt.
			if (indexable.Type == IndexableType.Remove) {
				IndexerRemovedReceipt r;
				r = new IndexerRemovedReceipt (indexable.Uri);
				receipt_queue.Add (r);
			}

			break;

		case IndexableType.PropertyChange:
			// Both queries are created together, so testing one
			// of them for null is enough.
			if (prop_change_query == null) {
				prop_change_query = new LNS.BooleanQuery ();
				prop_change_children_query = new LNS.BooleanQuery ();
			}

			prop_change_query.Add (UriQuery ("Uri", indexable.Uri), false, false);
			prop_change_children_query.Add (UriQuery ("ParentUri", indexable.Uri), false, false);
			break;
		}
	}

	if (HaveItemCount)
		AdjustItemCount (-delete_count);
	else
		SetItemCount (primary_reader);

	// Step #2: If we are doing any property changes,
	// we read in the current secondary documents and
	// store them in a hash table for use later.  Then we
	// delete the current secondary documents.
	Hashtable prop_change_docs = null;
	Hashtable prop_change_children_docs = null;
	if (prop_change_query != null) {
		prop_change_docs = UriFu.NewHashtable ();

		// One searcher serves both queries below and is closed
		// exactly once, after the second search.  (It used to be
		// closed between the two searches and then reused, which
		// only worked because a searcher built on top of a reader
		// leaves that reader open when it is closed.)
		LNS.IndexSearcher secondary_searcher;
		secondary_searcher = new LNS.IndexSearcher (secondary_reader);

		LNS.Hits hits;
		hits = secondary_searcher.Search (prop_change_query);

		ArrayList delete_terms;
		delete_terms = new ArrayList ();

		int N = hits.Length ();
		Document doc;
		for (int i = 0; i < N; ++i) {
			doc = hits.Doc (i);

			string uri_str;
			uri_str = doc.Get ("Uri");

			Uri uri;
			uri = UriFu.EscapedStringToUri (uri_str);
			prop_change_docs [uri] = doc;

			Term term;
			term = new Term ("Uri", uri_str);
			delete_terms.Add (term);
		}

		// The deletions are applied only after every hit has been
		// read, so the documents are loaded before they disappear.
		foreach (Term term in delete_terms)
			secondary_reader.Delete (term);

		// Step #2.5: Find all child indexables for this document
		// Store them to send them later as IndexerChildIndexablesReceipts
		prop_change_children_docs = UriFu.NewHashtable ();

		hits = secondary_searcher.Search (prop_change_children_query);
		N = hits.Length ();

		for (int i = 0; i < N; ++i) {
			doc = hits.Doc (i);

			string uri_str, parent_uri_str;
			uri_str = doc.Get ("Uri");
			parent_uri_str = doc.Get ("ParentUri");

			Uri uri, parent_uri;
			uri = UriFu.EscapedStringToUri (uri_str);
			parent_uri = UriFu.EscapedStringToUri (parent_uri_str);

			// Group the child Uris by their parent's Uri.
			if (! prop_change_children_docs.Contains (parent_uri)) {
				ArrayList c_list = new ArrayList ();
				prop_change_children_docs [parent_uri] = c_list;
			}

			ArrayList children_list = (ArrayList) prop_change_children_docs [parent_uri];
			children_list.Add (uri);
		}

		secondary_searcher.Close ();
	}

	// We are now done with the readers, so we close them.
	primary_reader.Close ();
	secondary_reader.Close ();

	// FIXME: If we crash at exactly this point, we are in
	// trouble.  Items will have been dropped from the index
	// without the proper replacements being added.

	// Step #3: Make another pass across our list of indexables
	// and write out any new documents.

	if (text_cache != null)
		text_cache.BeginTransaction ();

	// The secondary writer is created lazily, the first time a
	// secondary document actually has to be written.
	IndexWriter primary_writer, secondary_writer;
	primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
	secondary_writer = null;

	foreach (Indexable indexable in request_indexables) {

		if (indexable.Type == IndexableType.Remove)
			continue;

		IndexerAddedReceipt r;
		r = new IndexerAddedReceipt (indexable.Uri);
		receipt_queue.Add (r);

		if (indexable.Type == IndexableType.PropertyChange) {

			Logger.Log.Debug ("+{0} (props only)", indexable.DisplayUri);
			r.PropertyChangesOnly = true;

			// NOTE(review): doc is null when step #2 found no
			// secondary document for this Uri; RewriteDocument is
			// assumed to cope with that -- confirm.
			Document doc;
			doc = prop_change_docs [indexable.Uri] as Document;

			Document new_doc;
			new_doc = RewriteDocument (doc, indexable);

			// Write out the new document...
			if (secondary_writer == null)
				secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
			secondary_writer.AddDocument (new_doc);

			// Add children property change indexables...
			AddChildrenPropertyChange (
				prop_change_children_docs,
				indexable,
				receipt_queue);

			continue; // ...and proceed to the next Indexable
		}

		// If we reach this point we know we are dealing with an IndexableType.Add

		if (indexable.Type != IndexableType.Add)
			throw new Exception ("When I said it was an IndexableType.Add, I meant it!");

		Logger.Log.Debug ("+{0}", indexable.DisplayUri);

		Filter filter = null;

		if (FileFilterNotifier != null)
			FileFilterNotifier (indexable.DisplayUri, null); // We don't know what filter yet.

		// If we have content, try to find a filter
		// which we can use to process the indexable.
		try {
			FilterFactory.FilterIndexable (indexable, text_cache, out filter);
		} catch (Exception e) {
			Logger.Log.Error (e, "Unable to filter {0} (mimetype={1})", indexable.DisplayUri, indexable.MimeType);
			indexable.NoContent = true;
		}

		if (FileFilterNotifier != null)
			FileFilterNotifier (indexable.DisplayUri, filter); // Update with our filter

		Document primary_doc = null, secondary_doc = null;

		try {
			BuildDocuments (indexable, out primary_doc, out secondary_doc);
			primary_writer.AddDocument (primary_doc);
		} catch (Exception ex) {

			// If an exception was thrown, something bad probably happened
			// while we were filtering the content.  Set NoContent to true
			// and try again -- that way it will at least end up in the index,
			// even if we don't manage to extract the fulltext.

			Logger.Log.Debug (ex, "First attempt to index {0} failed", indexable.DisplayUri);

			indexable.NoContent = true;

			try {
				BuildDocuments (indexable, out primary_doc, out secondary_doc);
				primary_writer.AddDocument (primary_doc);
			} catch (Exception ex2) {
				Logger.Log.Debug (ex2, "Second attempt to index {0} failed, giving up...", indexable.DisplayUri);
			}
		}

		if (filter != null) {

			// Force the clean-up of temporary files, just in case.
			filter.Cleanup ();

			r.FilterName = filter.GetType ().ToString ();
			r.FilterVersion = filter.Version;

			// Create a receipt containing any child indexables.
			if (filter.ChildIndexables.Count > 0) {
				Log.Debug ("Generated {0} child indexable{1} from {2} (filtered with {3})", filter.ChildIndexables.Count, filter.ChildIndexables.Count > 1 ? "s" : "", indexable.DisplayUri, r.FilterName);
				IndexerChildIndexablesReceipt cr;
				cr = new IndexerChildIndexablesReceipt (indexable, filter.ChildIndexables);
				receipt_queue.Add (cr);
			}
		}

		if (FileFilterNotifier != null)
			FileFilterNotifier (null, null); // reset

		if (secondary_doc != null) {
			if (secondary_writer == null)
				secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);

			secondary_writer.AddDocument (secondary_doc);
		}

		AdjustItemCount (1);

		// Clean up any temporary files associated with filtering this indexable.
		indexable.Cleanup ();
	}

	if (text_cache != null)
		text_cache.CommitTransaction ();

	if (request.OptimizeIndex) {
		Stopwatch watch = new Stopwatch ();
		Logger.Log.Debug ("Optimizing {0}", IndexName);
		watch.Start ();
		primary_writer.Optimize ();
		// Optimizing covers the secondary index too, even when
		// nothing was written to it during this flush.
		if (secondary_writer == null)
			secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
		secondary_writer.Optimize ();
		watch.Stop ();
		Logger.Log.Debug ("{0} optimized in {1}", IndexName, watch);
	}

	// Step #4. Close our writers and return the events to
	// indicate what has happened.

	primary_writer.Close ();
	if (secondary_writer != null)
		secondary_writer.Close ();

	return (IndexerReceipt []) receipt_queue.ToArray (typeof (IndexerReceipt));
}
// Some of a parent's properties may be stored on its children (as
// parent: properties), so a property change on the parent has to be
// propagated to all of its children as well.
//
// children_docs maps a parent Uri to the list of its child Uris.  If
// the given parent has an entry, a PropertyChange indexable is created
// for every child and the whole batch is queued on receipt_queue as a
// single IndexerChildIndexablesReceipt.  Parents without an entry
// queue nothing.
private void AddChildrenPropertyChange (
	Hashtable children_docs,
	Indexable parent,
	ArrayList receipt_queue)
{
	if (children_docs.Contains (parent.Uri)) {
		ArrayList child_uris = (ArrayList) children_docs [parent.Uri];

		IndexerChildIndexablesReceipt receipt = new IndexerChildIndexablesReceipt ();
		ArrayList propagated = new ArrayList ();

		foreach (Uri child_uri in child_uris) {
			Indexable child = new Indexable (IndexableType.PropertyChange, child_uri);
			Log.Debug ("Creating property change child indexable for {1} (parent {0})", parent.Uri, child_uri);

			child.SetChildOf (parent);
			propagated.Add (child);
		}

		receipt.Children = propagated;
		receipt_queue.Add (receipt);
	}
}
439 ////////////////////////////////////////////////////////////////
// Optimizes the primary index and, when a secondary store exists, the
// secondary index as well.
//
// No analyzer is passed to the writers (null) since no documents are
// added here.  Each writer is closed in a finally block so that it is
// released even when Optimize () throws; the exception itself still
// propagates to the caller.
public void OptimizeNow ()
{
	IndexWriter writer;

	writer = new IndexWriter (PrimaryStore, null, false);
	try {
		writer.Optimize ();
	} finally {
		writer.Close ();
	}

	if (SecondaryStore != null) {
		writer = new IndexWriter (SecondaryStore, null, false);
		try {
			writer.Optimize ();
		} finally {
			writer.Close ();
		}
	}
}
// Merges the primary and secondary stores of index_to_merge into this
// driver's primary and secondary indexes, in that order.
//
// Throws ArgumentNullException when index_to_merge is null.  Each
// writer is closed in a finally block so that it is released even
// when AddIndexes () throws; the exception still propagates.
//
// FIXME: Error recovery.  If merging the secondary index fails, the
// primary index has already been merged and is not rolled back.
public void Merge (LuceneCommon index_to_merge)
{
	if (index_to_merge == null)
		throw new ArgumentNullException ("index_to_merge");

	// Merge the primary index
	IndexWriter primary_writer;
	Lucene.Net.Store.Directory[] primary_store = {index_to_merge.PrimaryStore};
	primary_writer = new IndexWriter (PrimaryStore, null, false);

	try {
		primary_writer.AddIndexes (primary_store);
	} finally {
		primary_writer.Close ();
	}

	// Merge the secondary index
	IndexWriter secondary_writer;
	Lucene.Net.Store.Directory[] secondary_store = {index_to_merge.SecondaryStore};
	secondary_writer = new IndexWriter (SecondaryStore, null, false);

	try {
		secondary_writer.AddIndexes (secondary_store);
	} finally {
		secondary_writer.Close ();
	}
}