// Source: beagle.git / beagled / LuceneIndexingDriver.cs
// (gitweb blob b4114527d7e97201e34ae20f4b42ff566bf59260,
//  commit: "Oops, fix a broken part of the patch")
//
// LuceneIndexingDriver.cs
//
// Copyright (C) 2004-2005 Novell, Inc.
//
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//

// This should be the only piece of source code that knows anything
// about Lucene's internals.
using System;
using System.Collections;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using System.Threading;
using System.Xml;
using System.Xml.Serialization;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using LNS = Lucene.Net.Search;

using Beagle.Util;
51 namespace Beagle.Daemon {
53 public class LuceneIndexingDriver : LuceneCommon, IIndexer {
55 object flush_lock = new object ();
57 public LuceneIndexingDriver (string index_name, int minor_version, bool build_usercache)
58 : base (index_name, minor_version)
60 if (Exists ())
61 Open ();
62 else
63 Create ();
65 if (build_usercache)
66 text_cache = TextCache.UserCache;
69 public LuceneIndexingDriver (string index_name, int minor_version)
70 : this (index_name, minor_version, true) { }
72 public LuceneIndexingDriver (string index_name, bool build_usercache)
73 : this (index_name, 0, build_usercache) { }
75 public LuceneIndexingDriver (string index_name)
76 : this (index_name, 0, true) { }
////////////////////////////////////////////////////////////////

// Implementation of the IIndexer interface
84 public IndexerReceipt [] Flush (IndexerRequest request)
86 // This is just to keep a big block of code from being
87 // indented an extra eight spaces.
88 lock (flush_lock)
89 return Flush_Unlocked (request);
92 private IndexerReceipt [] Flush_Unlocked (IndexerRequest request)
94 ArrayList receipt_queue;
95 receipt_queue = new ArrayList ();
97 IndexReader primary_reader, secondary_reader;
98 primary_reader = IndexReader.Open (PrimaryStore);
99 secondary_reader = IndexReader.Open (SecondaryStore);
101 // Step #1: Make our first pass over the list of
102 // indexables that make up our request. For each add
103 // or remove in the request, delete the associated
104 // items from the index. Assemble a query that will
105 // be used to find the secondary documents for any
106 // property change requests.
108 LNS.BooleanQuery prop_change_query = null;
109 int delete_count = 0;
111 foreach (Indexable indexable in request.Indexables) {
113 switch (indexable.Type) {
115 case IndexableType.Add:
116 case IndexableType.Remove:
118 string uri_str;
119 uri_str = UriFu.UriToSerializableString (indexable.Uri);
121 Logger.Log.Debug ("-{0}", indexable.DisplayUri);
123 Term term;
124 term = new Term ("Uri", uri_str);
125 delete_count += primary_reader.Delete (term);
126 if (secondary_reader != null)
127 secondary_reader.Delete (term);
129 // When we delete an indexable, also delete any children.
130 // FIXME: Shouldn't we also delete any children of children, etc.?
131 term = new Term ("ParentUri", uri_str);
132 delete_count += primary_reader.Delete (term);
133 if (secondary_reader != null)
134 secondary_reader.Delete (term);
136 // If this is a strict removal (and not a deletion that
137 // we are doing in anticipation of adding something back),
138 // queue up a removed receipt.
139 if (indexable.Type == IndexableType.Remove) {
140 IndexerRemovedReceipt r;
141 r = new IndexerRemovedReceipt (indexable.Uri);
142 receipt_queue.Add (r);
145 break;
147 case IndexableType.PropertyChange:
148 if (prop_change_query == null)
149 prop_change_query = new LNS.BooleanQuery ();
150 prop_change_query.Add (UriQuery ("Uri", indexable.Uri), false, false);
151 break;
155 if (HaveItemCount)
156 AdjustItemCount (-delete_count);
157 else
158 SetItemCount (primary_reader);
160 // Step #2: If we have are doing any property changes,
161 // we read in the current secondary documents and
162 // store them in a hash table for use later. Then we
163 // delete the current secondary documents.
164 Hashtable prop_change_docs = null;
165 if (prop_change_query != null) {
166 prop_change_docs = UriFu.NewHashtable ();
168 LNS.IndexSearcher secondary_searcher;
169 secondary_searcher = new LNS.IndexSearcher (secondary_reader);
171 LNS.Hits hits;
172 hits = secondary_searcher.Search (prop_change_query);
174 ArrayList delete_terms;
175 delete_terms = new ArrayList ();
177 int N = hits.Length ();
178 for (int i = 0; i < N; ++i) {
179 Document doc;
180 doc = hits.Doc (i);
182 string uri_str;
183 uri_str = doc.Get ("Uri");
185 Uri uri;
186 uri = UriFu.UriStringToUri (uri_str);
187 prop_change_docs [uri] = doc;
189 Term term;
190 term = new Term ("Uri", uri_str);
191 delete_terms.Add (term);
194 secondary_searcher.Close ();
196 foreach (Term term in delete_terms)
197 secondary_reader.Delete (term);
200 // We are now done with the readers, so we close them.
201 primary_reader.Close ();
202 secondary_reader.Close ();
204 // FIXME: If we crash at exactly this point, we are in
205 // trouble. Items will have been dropped from the index
206 // without the proper replacements being added.
208 // Step #3: Make another pass across our list of indexables
209 // and write out any new documents.
211 if (text_cache != null)
212 text_cache.BeginTransaction ();
214 IndexWriter primary_writer, secondary_writer;
215 primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
216 secondary_writer = null;
218 foreach (Indexable indexable in request.Indexables) {
220 if (indexable.Type == IndexableType.Remove)
221 continue;
223 IndexerAddedReceipt r;
224 r = new IndexerAddedReceipt (indexable.Uri);
225 receipt_queue.Add (r);
227 if (indexable.Type == IndexableType.PropertyChange) {
229 Logger.Log.Debug ("+{0} (props only)", indexable.DisplayUri);
230 r.PropertyChangesOnly = true;
232 Document doc;
233 doc = prop_change_docs [indexable.Uri] as Document;
235 Document new_doc;
236 new_doc = RewriteDocument (doc, indexable);
238 // Write out the new document...
239 if (secondary_writer == null)
240 secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
241 secondary_writer.AddDocument (new_doc);
243 continue; // ...and proceed to the next Indexable
246 // If we reach this point we know we are dealing with an IndexableType.Add
248 if (indexable.Type != IndexableType.Add)
249 throw new Exception ("When I said it was an IndexableType.Add, I meant it!");
251 Logger.Log.Debug ("+{0}", indexable.DisplayUri);
253 Filter filter = null;
255 // If we have content, try to find a filter
256 // which we can use to process the indexable.
257 try {
258 FilterFactory.FilterIndexable (indexable, text_cache, out filter);
259 } catch (Exception e) {
260 Logger.Log.Error ("Unable to filter {0} (mimetype={1})", indexable.DisplayUri, indexable.MimeType);
261 Logger.Log.Error (e);
262 indexable.NoContent = true;
265 Document primary_doc = null, secondary_doc = null;
267 try {
268 BuildDocuments (indexable, out primary_doc, out secondary_doc);
269 primary_writer.AddDocument (primary_doc);
270 } catch (Exception ex) {
272 // If an exception was thrown, something bad probably happened
273 // while we were filtering the content. Set NoContent to true
274 // and try again -- that way it will at least end up in the index,
275 // even if we don't manage to extract the fulltext.
277 Logger.Log.Debug ("First attempt to index {0} failed", indexable.DisplayUri);
278 Logger.Log.Debug (ex);
280 indexable.NoContent = true;
282 try {
283 BuildDocuments (indexable, out primary_doc, out secondary_doc);
284 primary_writer.AddDocument (primary_doc);
285 } catch (Exception ex2) {
286 Logger.Log.Debug ("Second attempt to index {0} failed, giving up...", indexable.DisplayUri);
287 Logger.Log.Debug (ex2);
291 if (filter != null) {
293 // Force the clean-up of temporary files, just in case.
294 filter.Cleanup ();
296 r.FilterName = filter.GetType ().ToString ();
297 r.FilterVersion = filter.Version;
299 // Create a receipt containing any child indexables.
300 if (filter.ChildIndexables.Count > 0) {
301 IndexerChildIndexablesReceipt cr;
302 cr = new IndexerChildIndexablesReceipt (indexable, filter.ChildIndexables);
303 receipt_queue.Add (cr);
307 if (secondary_doc != null) {
308 if (secondary_writer == null)
309 secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
311 secondary_writer.AddDocument (secondary_doc);
314 AdjustItemCount (1);
316 // Clean up any temporary files associated with filtering this indexable.
317 indexable.Cleanup ();
320 if (text_cache != null)
321 text_cache.CommitTransaction ();
323 if (request.OptimizeIndex) {
324 Stopwatch watch = new Stopwatch ();
325 Logger.Log.Debug ("Optimizing {0}", IndexName);
326 watch.Start ();
327 primary_writer.Optimize ();
328 if (secondary_writer == null)
329 secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
330 secondary_writer.Optimize ();
331 watch.Stop ();
332 Logger.Log.Debug ("{0} optimized in {1}", IndexName, watch);
335 // Step #4. Close our writers and return the events to
336 // indicate what has happened.
338 primary_writer.Close ();
339 if (secondary_writer != null)
340 secondary_writer.Close ();
342 IndexerReceipt [] receipt_array;
343 receipt_array = new IndexerReceipt [receipt_queue.Count];
344 for (int i = 0; i < receipt_queue.Count; ++i)
345 receipt_array [i] = (IndexerReceipt) receipt_queue [i];
347 return receipt_array;
////////////////////////////////////////////////////////////////
352 public void OptimizeNow ()
354 IndexWriter writer;
356 writer = new IndexWriter (PrimaryStore, null, false);
357 writer.Optimize ();
358 writer.Close ();
360 if (SecondaryStore != null) {
361 writer = new IndexWriter (SecondaryStore, null, false);
362 writer.Optimize ();
363 writer.Close ();
368 public void Merge (LuceneCommon index_to_merge)
370 // FIXME: Error recovery
372 // Merge the primary index
373 IndexWriter primary_writer;
374 Lucene.Net.Store.Directory[] primary_store = {index_to_merge.PrimaryStore};
375 primary_writer = new IndexWriter (PrimaryStore, null, false);
377 primary_writer.AddIndexes (primary_store);
378 primary_writer.Close ();
380 // Merge the secondary index
381 IndexWriter secondary_writer;
382 Lucene.Net.Store.Directory[] secondary_store = {index_to_merge.SecondaryStore};
383 secondary_writer = new IndexWriter (SecondaryStore, null, false);
385 secondary_writer.AddIndexes (secondary_store);
386 secondary_writer.Close ();