Add a count to the number of child indexables found with
[beagle.git] / beagled / LuceneIndexingDriver.cs
blob 6fc2c720fab2ed8d3e752f99ca2ba0279c2fbeb9
//
// LuceneIndexingDriver.cs
//
// Copyright (C) 2004-2005 Novell, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//

// This should be the only piece of source code that knows anything
// about Lucene's internals.

using System;
using System.Collections;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using System.Threading;
using System.Xml;
using System.Xml.Serialization;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using LNS = Lucene.Net.Search;

using Beagle.Util;

namespace Beagle.Daemon {

	public class LuceneIndexingDriver : LuceneCommon, IIndexer {

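		// Serializes calls to Flush (); only one flush may run against the index at a time.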
		object flush_lock = new object ();

		public LuceneIndexingDriver (string index_name, int minor_version, bool build_usercache)
			: base (index_name, minor_version)
		{
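			// Open the on-disk index if it already exists; otherwise create it.
			// Optionally attach the user text cache, which is handed to the
			// filters when content is indexed in Flush ().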
			if (Exists ())
				Open ();
			else
				Create ();

			if (build_usercache)
				text_cache = TextCache.UserCache;
		}

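		// Convenience overloads: the omitted arguments default to minor
		// version 0 and to building the user text cache.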
		public LuceneIndexingDriver (string index_name, int minor_version)
			: this (index_name, minor_version, true) { }

		public LuceneIndexingDriver (string index_name, bool build_usercache)
			: this (index_name, 0, build_usercache) { }

		public LuceneIndexingDriver (string index_name)
			: this (index_name, 0, true) { }

		////////////////////////////////////////////////////////////////

		// Implementation of the IIndexer interface

		public IndexerReceipt [] Flush (IndexerRequest request)
		{
			// This is just to keep a big block of code from being
			// indented an extra eight spaces.
			lock (flush_lock)
				return Flush_Unlocked (request);
		}

		private IndexerReceipt [] Flush_Unlocked (IndexerRequest request)
		{
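			// Receipts describing what happened to each indexable; these are
			// returned to the caller when the flush completes.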
			ArrayList receipt_queue;
			receipt_queue = new ArrayList ();

			IndexReader primary_reader, secondary_reader;
			primary_reader = IndexReader.Open (PrimaryStore);
			secondary_reader = IndexReader.Open (SecondaryStore);

			// Step #1: Make our first pass over the list of
			// indexables that make up our request.  For each add
			// or remove in the request, delete the associated
			// items from the index.  Assemble a query that will
			// be used to find the secondary documents for any
			// property change requests.

			LNS.BooleanQuery prop_change_query = null;
			LNS.BooleanQuery prop_change_children_query = null;
			int delete_count = 0;

			ICollection request_indexables = request.Indexables;

			foreach (Indexable indexable in request_indexables) {

				switch (indexable.Type) {

				case IndexableType.Add:
				case IndexableType.Remove:

					string uri_str;
					uri_str = UriFu.UriToEscapedString (indexable.Uri);

					Logger.Log.Debug ("-{0}", indexable.DisplayUri);

					Term term;
					term = new Term ("Uri", uri_str);
					delete_count += primary_reader.Delete (term);
					if (secondary_reader != null)
						secondary_reader.Delete (term);

					// When we delete an indexable, also delete any children.
					// FIXME: Shouldn't we also delete any children of children, etc.?
					term = new Term ("ParentUri", uri_str);
					delete_count += primary_reader.Delete (term);
					if (secondary_reader != null)
						secondary_reader.Delete (term);

					// If this is a strict removal (and not a deletion that
					// we are doing in anticipation of adding something back),
					// queue up a removed receipt.
					if (indexable.Type == IndexableType.Remove) {
						IndexerRemovedReceipt r;
						r = new IndexerRemovedReceipt (indexable.Uri);
						receipt_queue.Add (r);
					}

					break;

				case IndexableType.PropertyChange:
					if (prop_change_query == null) {
						prop_change_query = new LNS.BooleanQuery ();
						prop_change_children_query = new LNS.BooleanQuery ();
					}

					prop_change_query.Add (UriQuery ("Uri", indexable.Uri), false, false);
					prop_change_children_query.Add (UriQuery ("ParentUri", indexable.Uri), false, false);
					break;
				}
			}

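			// Keep the cached item count in sync: subtract the documents we just
			// deleted, or recount from the primary index if we have no cached count.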
			if (HaveItemCount)
				AdjustItemCount (-delete_count);
			else
				SetItemCount (primary_reader);

			// Step #2: If we are doing any property changes,
			// we read in the current secondary documents and
			// store them in a hash table for use later.  Then we
			// delete the current secondary documents.
			Hashtable prop_change_docs = null;
			Hashtable prop_change_children_docs = null;
			if (prop_change_query != null) {
				prop_change_docs = UriFu.NewHashtable ();

				LNS.IndexSearcher secondary_searcher;
				secondary_searcher = new LNS.IndexSearcher (secondary_reader);

				LNS.Hits hits;
				hits = secondary_searcher.Search (prop_change_query);

				ArrayList delete_terms;
				delete_terms = new ArrayList ();

				int N = hits.Length ();
				Document doc;
				for (int i = 0; i < N; ++i) {
					doc = hits.Doc (i);

					string uri_str;
					uri_str = doc.Get ("Uri");

					Uri uri;
					uri = UriFu.EscapedStringToUri (uri_str);
					prop_change_docs [uri] = doc;

					Term term;
					term = new Term ("Uri", uri_str);
					delete_terms.Add (term);
				}

				secondary_searcher.Close ();

				foreach (Term term in delete_terms)
					secondary_reader.Delete (term);

				// Step #2.5: Find all child indexables for this document.
				// Store them to send them later as IndexerChildIndexablesReceipts.
				prop_change_children_docs = UriFu.NewHashtable ();

				hits = secondary_searcher.Search (prop_change_children_query);
				N = hits.Length ();

				for (int i = 0; i < N; ++i) {
					doc = hits.Doc (i);

					string uri_str, parent_uri_str;
					uri_str = doc.Get ("Uri");
					parent_uri_str = doc.Get ("ParentUri");

					Uri uri, parent_uri;
					uri = UriFu.EscapedStringToUri (uri_str);
					parent_uri = UriFu.EscapedStringToUri (parent_uri_str);

					if (! prop_change_children_docs.Contains (parent_uri)) {
						ArrayList c_list = new ArrayList ();
						prop_change_children_docs [parent_uri] = c_list;
					}

					ArrayList children_list = (ArrayList) prop_change_children_docs [parent_uri];
					children_list.Add (uri);
				}

				secondary_searcher.Close ();
			}

			// We are now done with the readers, so we close them.
			primary_reader.Close ();
			secondary_reader.Close ();

			// FIXME: If we crash at exactly this point, we are in
			// trouble.  Items will have been dropped from the index
			// without the proper replacements being added.

			// Step #3: Make another pass across our list of indexables
			// and write out any new documents.

			if (text_cache != null)
				text_cache.BeginTransaction ();

			IndexWriter primary_writer, secondary_writer;
			primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
			secondary_writer = null;

			foreach (Indexable indexable in request_indexables) {

				if (indexable.Type == IndexableType.Remove)
					continue;

				IndexerAddedReceipt r;
				r = new IndexerAddedReceipt (indexable.Uri);
				receipt_queue.Add (r);

				if (indexable.Type == IndexableType.PropertyChange) {

					Logger.Log.Debug ("+{0} (props only)", indexable.DisplayUri);
					r.PropertyChangesOnly = true;

					Document doc;
					doc = prop_change_docs [indexable.Uri] as Document;

					Document new_doc;
					new_doc = RewriteDocument (doc, indexable);

					// Write out the new document...
					if (secondary_writer == null)
						secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
					secondary_writer.AddDocument (new_doc);

					// Add children property change indexables...
					AddChildrenPropertyChange (
						prop_change_children_docs,
						indexable,
						receipt_queue);

					continue; // ...and proceed to the next Indexable
				}

				// If we reach this point we know we are dealing with an IndexableType.Add

				if (indexable.Type != IndexableType.Add)
					throw new Exception ("When I said it was an IndexableType.Add, I meant it!");

				Logger.Log.Debug ("+{0}", indexable.DisplayUri);

				Filter filter = null;

				// If we have content, try to find a filter
				// which we can use to process the indexable.
				try {
					FilterFactory.FilterIndexable (indexable, text_cache, out filter);
				} catch (Exception e) {
					Logger.Log.Error (e, "Unable to filter {0} (mimetype={1})", indexable.DisplayUri, indexable.MimeType);
					indexable.NoContent = true;
				}

				Document primary_doc = null, secondary_doc = null;

				try {
					BuildDocuments (indexable, out primary_doc, out secondary_doc);
					primary_writer.AddDocument (primary_doc);
				} catch (Exception ex) {

					// If an exception was thrown, something bad probably happened
					// while we were filtering the content.  Set NoContent to true
					// and try again -- that way it will at least end up in the index,
					// even if we don't manage to extract the fulltext.

					Logger.Log.Debug (ex, "First attempt to index {0} failed", indexable.DisplayUri);

					indexable.NoContent = true;

					try {
						BuildDocuments (indexable, out primary_doc, out secondary_doc);
						primary_writer.AddDocument (primary_doc);
					} catch (Exception ex2) {
						Logger.Log.Debug (ex2, "Second attempt to index {0} failed, giving up...", indexable.DisplayUri);
					}
				}

				if (filter != null) {

					// Force the clean-up of temporary files, just in case.
					filter.Cleanup ();

					r.FilterName = filter.GetType ().ToString ();
					r.FilterVersion = filter.Version;

					// Create a receipt containing any child indexables.
					if (filter.ChildIndexables.Count > 0) {
						Log.Debug ("Generated {0} child indexable{1} from {2} (filtered with {3})", filter.ChildIndexables.Count, filter.ChildIndexables.Count > 1 ? "s" : "", indexable.DisplayUri, r.FilterName);
						IndexerChildIndexablesReceipt cr;
						cr = new IndexerChildIndexablesReceipt (indexable, filter.ChildIndexables);
						receipt_queue.Add (cr);
					}
				}

				if (secondary_doc != null) {
					if (secondary_writer == null)
						secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);

					secondary_writer.AddDocument (secondary_doc);
				}

				AdjustItemCount (1);

				// Clean up any temporary files associated with filtering this indexable.
				indexable.Cleanup ();
			}

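			// Every indexable has been processed; commit whatever the filters
			// wrote to the text cache during this flush.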
			if (text_cache != null)
				text_cache.CommitTransaction ();

			if (request.OptimizeIndex) {
				Stopwatch watch = new Stopwatch ();
				Logger.Log.Debug ("Optimizing {0}", IndexName);
				watch.Start ();
				primary_writer.Optimize ();
				if (secondary_writer == null)
					secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
				secondary_writer.Optimize ();
				watch.Stop ();
				Logger.Log.Debug ("{0} optimized in {1}", IndexName, watch);
			}

			// Step #4: Close our writers and return the receipts to
			// indicate what has happened.

			primary_writer.Close ();
			if (secondary_writer != null)
				secondary_writer.Close ();

			IndexerReceipt [] receipt_array;
			receipt_array = new IndexerReceipt [receipt_queue.Count];
			for (int i = 0; i < receipt_queue.Count; ++i)
				receipt_array [i] = (IndexerReceipt) receipt_queue [i];

			return receipt_array;
		}

		// Since some parent properties may be stored on the children as
		// "parent:" properties, any property change should be propagated
		// to all of the children as well.
		private void AddChildrenPropertyChange (
				Hashtable children_docs,
				Indexable parent,
				ArrayList receipt_queue)
		{
			if (! children_docs.Contains (parent.Uri))
				return;

			ArrayList children_list = (ArrayList) children_docs [parent.Uri];
			IndexerChildIndexablesReceipt child_r;
			child_r = new IndexerChildIndexablesReceipt ();
			ArrayList child_indexable_list = new ArrayList ();

			foreach (Uri uri in children_list) {
				Indexable child_indexable;
				child_indexable = new Indexable (IndexableType.PropertyChange, uri);
				Log.Debug ("Creating property change child indexable for {1} (parent {0})", parent.Uri, uri);

				child_indexable.SetChildOf (parent);
				child_indexable_list.Add (child_indexable);
			}

			child_r.Children = child_indexable_list;
			receipt_queue.Add (child_r);
		}

		////////////////////////////////////////////////////////////////

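		// Optimize both the primary and secondary indexes immediately,
		// outside of a normal Flush () request.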
		public void OptimizeNow ()
		{
			IndexWriter writer;

			writer = new IndexWriter (PrimaryStore, null, false);
			writer.Optimize ();
			writer.Close ();

			if (SecondaryStore != null) {
				writer = new IndexWriter (SecondaryStore, null, false);
				writer.Optimize ();
				writer.Close ();
			}
		}

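		// Fold the contents of another index into this one by merging its
		// primary and secondary stores with Lucene's AddIndexes ().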
		public void Merge (LuceneCommon index_to_merge)
		{
			// FIXME: Error recovery

			// Merge the primary index
			IndexWriter primary_writer;
			Lucene.Net.Store.Directory[] primary_store = {index_to_merge.PrimaryStore};
			primary_writer = new IndexWriter (PrimaryStore, null, false);

			primary_writer.AddIndexes (primary_store);
			primary_writer.Close ();

			// Merge the secondary index
			IndexWriter secondary_writer;
			Lucene.Net.Store.Directory[] secondary_store = {index_to_merge.SecondaryStore};
			secondary_writer = new IndexWriter (SecondaryStore, null, false);

			secondary_writer.AddIndexes (secondary_store);
			secondary_writer.Close ();
		}