beagled/LuceneIndexingDriver.cs
//
// LuceneIndexingDriver.cs
//
// Copyright (C) 2004-2005 Novell, Inc.
//

//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//

// This should be the only piece of source code that knows anything
// about Lucene's internals.
using System;
using System.Collections;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using System.Threading;
using System.Xml;
using System.Xml.Serialization;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using LNS = Lucene.Net.Search;

using Beagle.Util;
namespace Beagle.Daemon {

	public class LuceneIndexingDriver : LuceneCommon, IIndexer {
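
		// Used to serialize calls to Flush (), so that only one
		// flush can be modifying the index at a time.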
		object flush_lock = new object ();

		public LuceneIndexingDriver (string index_name, int minor_version, bool build_usercache)
			: base (index_name, minor_version)
		{
			if (Exists ())
				Open ();
			else
				Create ();

			if (build_usercache)
				text_cache = TextCache.UserCache;
		}

		public LuceneIndexingDriver (string index_name, int minor_version)
			: this (index_name, minor_version, true) { }

		public LuceneIndexingDriver (string index_name, bool build_usercache)
			: this (index_name, 0, build_usercache) { }

		public LuceneIndexingDriver (string index_name)
			: this (index_name, 0, true) { }

		////////////////////////////////////////////////////////////////

		//
		// Implementation of the IIndexer interface
		//

		public IndexerReceipt [] Flush (IndexerRequest request)
		{
			// This is just to keep a big block of code from being
			// indented an extra eight spaces.
			lock (flush_lock)
				return Flush_Unlocked (request);
		}
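
		// Does the real work of a flush; see the "Step #N" comments
		// below for the overall flow: delete old entries, capture
		// secondary documents for property changes, write out the
		// new documents, then close up and build the receipts.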
		private IndexerReceipt [] Flush_Unlocked (IndexerRequest request)
		{
			ArrayList receipt_queue;
			receipt_queue = new ArrayList ();

			IndexReader primary_reader, secondary_reader;
			primary_reader = IndexReader.Open (PrimaryStore);
			secondary_reader = IndexReader.Open (SecondaryStore);

			// Step #1: Make our first pass over the list of
			// indexables that make up our request. For each add
			// or remove in the request, delete the associated
			// items from the index. Assemble a query that will
			// be used to find the secondary documents for any
			// property change requests.

			LNS.BooleanQuery prop_change_query = null;
			LNS.BooleanQuery prop_change_children_query = null;
			int delete_count = 0;

			foreach (Indexable indexable in request.Indexables) {

				switch (indexable.Type) {

				case IndexableType.Add:
				case IndexableType.Remove:

					string uri_str;
					uri_str = UriFu.UriToEscapedString (indexable.Uri);

					Logger.Log.Debug ("-{0}", indexable.DisplayUri);

					Term term;
					term = new Term ("Uri", uri_str);
					delete_count += primary_reader.Delete (term);
					if (secondary_reader != null)
						secondary_reader.Delete (term);

					// When we delete an indexable, also delete any children.
					// FIXME: Shouldn't we also delete any children of children, etc.?
					term = new Term ("ParentUri", uri_str);
					delete_count += primary_reader.Delete (term);
					if (secondary_reader != null)
						secondary_reader.Delete (term);

					// If this is a strict removal (and not a deletion that
					// we are doing in anticipation of adding something back),
					// queue up a removed receipt.
					if (indexable.Type == IndexableType.Remove) {
						IndexerRemovedReceipt r;
						r = new IndexerRemovedReceipt (indexable.Uri);
						receipt_queue.Add (r);
					}

					break;

				case IndexableType.PropertyChange:
					if (prop_change_query == null) {
						prop_change_query = new LNS.BooleanQuery ();
						prop_change_children_query = new LNS.BooleanQuery ();
					}

					prop_change_query.Add (UriQuery ("Uri", indexable.Uri), false, false);
					prop_change_children_query.Add (UriQuery ("ParentUri", indexable.Uri), false, false);
					break;
				}
			}
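
			// Keep our cached item count in sync with the
			// deletions we just made; if we don't have a cached
			// count, recompute it from the primary reader.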
			if (HaveItemCount)
				AdjustItemCount (-delete_count);
			else
				SetItemCount (primary_reader);

			// Step #2: If we are doing any property changes,
			// we read in the current secondary documents and
			// store them in a hash table for use later. Then we
			// delete the current secondary documents.
			Hashtable prop_change_docs = null;
			Hashtable prop_change_children_docs = null;
			if (prop_change_query != null) {
				prop_change_docs = UriFu.NewHashtable ();

				LNS.IndexSearcher secondary_searcher;
				secondary_searcher = new LNS.IndexSearcher (secondary_reader);

				LNS.Hits hits;
				hits = secondary_searcher.Search (prop_change_query);

				ArrayList delete_terms;
				delete_terms = new ArrayList ();

				int N = hits.Length ();
				Document doc;
				for (int i = 0; i < N; ++i) {
					doc = hits.Doc (i);

					string uri_str;
					uri_str = doc.Get ("Uri");

					Uri uri;
					uri = UriFu.EscapedStringToUri (uri_str);
					prop_change_docs [uri] = doc;

					Term term;
					term = new Term ("Uri", uri_str);
					delete_terms.Add (term);
				}

				// Don't close the searcher yet; we still need it
				// below for the children query.

				foreach (Term term in delete_terms)
					secondary_reader.Delete (term);

				// Step #2.5: Find all child indexables for this document
				// Store them to send them later as IndexerChildIndexablesReceipts
				prop_change_children_docs = UriFu.NewHashtable ();

				hits = secondary_searcher.Search (prop_change_children_query);
				N = hits.Length ();

				for (int i = 0; i < N; ++i) {
					doc = hits.Doc (i);

					string uri_str, parent_uri_str;
					uri_str = doc.Get ("Uri");
					parent_uri_str = doc.Get ("ParentUri");

					Uri uri, parent_uri;
					uri = UriFu.EscapedStringToUri (uri_str);
					parent_uri = UriFu.EscapedStringToUri (parent_uri_str);

					if (! prop_change_children_docs.Contains (parent_uri)) {
						ArrayList c_list = new ArrayList ();
						prop_change_children_docs [parent_uri] = c_list;
					}

					ArrayList children_list = (ArrayList) prop_change_children_docs [parent_uri];
					children_list.Add (uri);
				}

				secondary_searcher.Close ();
			}

			// We are now done with the readers, so we close them.
			primary_reader.Close ();
			secondary_reader.Close ();

			// FIXME: If we crash at exactly this point, we are in
			// trouble. Items will have been dropped from the index
			// without the proper replacements being added.

			// Step #3: Make another pass across our list of indexables
			// and write out any new documents.

			if (text_cache != null)
				text_cache.BeginTransaction ();
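
			// The primary writer is always needed; the secondary
			// writer is only created on demand, the first time it
			// is actually used.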
			IndexWriter primary_writer, secondary_writer;
			primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
			secondary_writer = null;

			foreach (Indexable indexable in request.Indexables) {

				if (indexable.Type == IndexableType.Remove)
					continue;

				IndexerAddedReceipt r;
				r = new IndexerAddedReceipt (indexable.Uri);
				receipt_queue.Add (r);

				if (indexable.Type == IndexableType.PropertyChange) {

					Logger.Log.Debug ("+{0} (props only)", indexable.DisplayUri);
					r.PropertyChangesOnly = true;

					Document doc;
					doc = prop_change_docs [indexable.Uri] as Document;

					Document new_doc;
					new_doc = RewriteDocument (doc, indexable);

					// Write out the new document...
					if (secondary_writer == null)
						secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
					secondary_writer.AddDocument (new_doc);

					// Add children property change indexables...
					AddChildrenPropertyChange (
						prop_change_children_docs,
						indexable,
						receipt_queue);

					continue; // ...and proceed to the next Indexable
				}

				// If we reach this point we know we are dealing with an IndexableType.Add

				if (indexable.Type != IndexableType.Add)
					throw new Exception ("When I said it was an IndexableType.Add, I meant it!");

				Logger.Log.Debug ("+{0}", indexable.DisplayUri);

				Filter filter = null;

				// If we have content, try to find a filter
				// which we can use to process the indexable.
				try {
					FilterFactory.FilterIndexable (indexable, text_cache, out filter);
				} catch (Exception e) {
					Logger.Log.Error (e, "Unable to filter {0} (mimetype={1})", indexable.DisplayUri, indexable.MimeType);
					indexable.NoContent = true;
				}

				Document primary_doc = null, secondary_doc = null;

				try {
					BuildDocuments (indexable, out primary_doc, out secondary_doc);
					primary_writer.AddDocument (primary_doc);
				} catch (Exception ex) {

					// If an exception was thrown, something bad probably happened
					// while we were filtering the content. Set NoContent to true
					// and try again -- that way it will at least end up in the index,
					// even if we don't manage to extract the fulltext.

					Logger.Log.Debug (ex, "First attempt to index {0} failed", indexable.DisplayUri);

					indexable.NoContent = true;

					try {
						BuildDocuments (indexable, out primary_doc, out secondary_doc);
						primary_writer.AddDocument (primary_doc);
					} catch (Exception ex2) {
						Logger.Log.Debug (ex2, "Second attempt to index {0} failed, giving up...", indexable.DisplayUri);
					}
				}

				if (filter != null) {

					// Force the clean-up of temporary files, just in case.
					filter.Cleanup ();

					r.FilterName = filter.GetType ().ToString ();
					r.FilterVersion = filter.Version;

					// Create a receipt containing any child indexables.
					if (filter.ChildIndexables.Count > 0) {
						Log.Debug ("{0} (filtered with {1}) has generated {2} child indexable{3}", indexable.DisplayUri, r.FilterName, filter.ChildIndexables.Count, filter.ChildIndexables.Count > 1 ? "s" : "");
						IndexerChildIndexablesReceipt cr;
						cr = new IndexerChildIndexablesReceipt (indexable, filter.ChildIndexables);
						receipt_queue.Add (cr);
					}
				}

				if (secondary_doc != null) {
					if (secondary_writer == null)
						secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);

					secondary_writer.AddDocument (secondary_doc);
				}

				AdjustItemCount (1);

				// Clean up any temporary files associated with filtering this indexable.
				indexable.Cleanup ();
			}

			if (text_cache != null)
				text_cache.CommitTransaction ();
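
			// Only optimize when the request explicitly asks for
			// it, and log how long the optimization took.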
			if (request.OptimizeIndex) {
				Stopwatch watch = new Stopwatch ();
				Logger.Log.Debug ("Optimizing {0}", IndexName);
				watch.Start ();
				primary_writer.Optimize ();
				if (secondary_writer == null)
					secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
				secondary_writer.Optimize ();
				watch.Stop ();
				Logger.Log.Debug ("{0} optimized in {1}", IndexName, watch);
			}

			// Step #4. Close our writers and return the events to
			// indicate what has happened.

			primary_writer.Close ();
			if (secondary_writer != null)
				secondary_writer.Close ();

			IndexerReceipt [] receipt_array;
			receipt_array = new IndexerReceipt [receipt_queue.Count];
			for (int i = 0; i < receipt_queue.Count; ++i)
				receipt_array [i] = (IndexerReceipt) receipt_queue [i];

			return receipt_array;
		}

		// Since some parent properties may be stored on child
		// documents as "parent:" properties, any property change
		// should be propagated to all of its children as well.
		private void AddChildrenPropertyChange (
				Hashtable children_docs,
				Indexable parent,
				ArrayList receipt_queue)
		{
			if (! children_docs.Contains (parent.Uri))
				return;

			ArrayList children_list = (ArrayList) children_docs [parent.Uri];
			IndexerChildIndexablesReceipt child_r;
			child_r = new IndexerChildIndexablesReceipt ();
			ArrayList child_indexable_list = new ArrayList ();

			foreach (Uri uri in children_list) {
				Indexable child_indexable;
				child_indexable = new Indexable (IndexableType.PropertyChange, uri);
				Log.Debug ("Creating property change child indexable for {1} (parent {0})", parent.Uri, uri);

				child_indexable.SetChildOf (parent);
				child_indexable_list.Add (child_indexable);
			}

			child_r.Children = child_indexable_list;
			receipt_queue.Add (child_r);
		}

		////////////////////////////////////////////////////////////////
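
		// Optimize the primary (and, if present, the secondary)
		// index right away, outside of a normal Flush ().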
		public void OptimizeNow ()
		{
			IndexWriter writer;

			writer = new IndexWriter (PrimaryStore, null, false);
			writer.Optimize ();
			writer.Close ();

			if (SecondaryStore != null) {
				writer = new IndexWriter (SecondaryStore, null, false);
				writer.Optimize ();
				writer.Close ();
			}
		}
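
		// Fold another index pair (primary and secondary) into
		// this one using Lucene's AddIndexes ().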
		public void Merge (LuceneCommon index_to_merge)
		{
			// FIXME: Error recovery

			// Merge the primary index
			IndexWriter primary_writer;
			Lucene.Net.Store.Directory[] primary_store = {index_to_merge.PrimaryStore};
			primary_writer = new IndexWriter (PrimaryStore, null, false);

			primary_writer.AddIndexes (primary_store);
			primary_writer.Close ();

			// Merge the secondary index
			IndexWriter secondary_writer;
			Lucene.Net.Store.Directory[] secondary_store = {index_to_merge.SecondaryStore};
			secondary_writer = new IndexWriter (SecondaryStore, null, false);

			secondary_writer.AddIndexes (secondary_store);
			secondary_writer.Close ();
		}
	}
}