// Thumbnail file hits. Based on a patch from D Bera
// beagle.git: beagled/LuceneIndexingDriver.cs
// blob ebbed3d84b2c294595a9557ff278a8a500676545
//
// LuceneIndexingDriver.cs
//
// Copyright (C) 2004-2005 Novell, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//

// This should be the only piece of source code that knows anything
// about Lucene's internals.
using System;
using System.Collections;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using System.Threading;
using System.Xml;
using System.Xml.Serialization;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using LNS = Lucene.Net.Search;

using Beagle.Util;
namespace Beagle.Daemon {

	public class LuceneIndexingDriver : LuceneCommon, IIndexer {

		// Pending work, keyed by Uri.  A non-null Indexable value is a
		// pending add; a null value marks the Uri for removal on the
		// next flush.  Also serves as the lock object for all queue
		// operations.
		Hashtable pending_by_uri = UriFu.NewHashtable ();

		// When true, the next flush also optimizes both indexes.
		bool optimize_during_next_flush = false;
58 public LuceneIndexingDriver (string index_name, int minor_version) : base (index_name, minor_version)
60 if (Exists ())
61 Open ();
62 else
63 Create ();
66 public LuceneIndexingDriver (string index_name) : this (index_name, 0)
67 { }
69 ////////////////////////////////////////////////////////////////
72 // Implementation of the IIndexer interface
75 public void Add (Indexable indexable)
77 lock (pending_by_uri) {
78 Indexable existing_indexable;
79 existing_indexable = pending_by_uri [indexable.Uri] as Indexable;
81 // If we already have an Indexable queued up and this is a property-change
82 // only Indexable, just change the original Indexable's properties.
83 if (existing_indexable != null && indexable.PropertyChangesOnly) {
84 existing_indexable.MergeProperties (indexable);
85 return;
88 pending_by_uri [indexable.Uri] = indexable;
92 public void Remove (Uri uri)
94 lock (pending_by_uri) {
95 pending_by_uri [uri] = null;
99 public void Optimize ()
101 optimize_during_next_flush = true;
104 public IndexerReceipt [] FlushAndBlock ()
106 ArrayList receipt_queue;
108 lock (pending_by_uri) {
110 receipt_queue = new ArrayList ();
112 // Step #1: Delete all items with the same URIs
113 // as our pending items from the index.
115 IndexReader primary_reader, secondary_reader;
116 primary_reader = IndexReader.Open (PrimaryStore);
117 secondary_reader = IndexReader.Open (SecondaryStore);
119 LNS.BooleanQuery prop_change_query = null;
121 int delete_count = 0;
123 foreach (DictionaryEntry entry in pending_by_uri) {
124 Uri uri = entry.Key as Uri;
125 Indexable indexable = entry.Value as Indexable;
127 // If this indexable only contains property changes,
128 // all we do at this point is assemble the query that we will
129 // use to retrieve the current property values. We'll ultimately
130 // need to delete the existing secondary documents, but not
131 // until we've loaded them...
132 if (indexable != null && indexable.PropertyChangesOnly) {
133 if (prop_change_query == null)
134 prop_change_query = new LNS.BooleanQuery ();
135 prop_change_query.Add (UriQuery ("Uri", uri), false, false);
136 continue;
139 Logger.Log.Debug ("-{0}", uri);
141 Term term;
142 term = new Term ("Uri", UriFu.UriToSerializableString (uri));
143 delete_count += primary_reader.Delete (term);
144 if (secondary_reader != null)
145 secondary_reader.Delete (term);
147 // When we delete an indexable, also delete any children.
148 // FIXME: Shouldn't we also delete any children of children, etc.?
149 term = new Term ("ParentUri", UriFu.UriToSerializableString (uri));
150 delete_count += primary_reader.Delete (term);
151 if (secondary_reader != null)
152 secondary_reader.Delete (term);
154 // If this is a strict removal (and not a deletion that
155 // we are doing in anticipation of adding something back),
156 // queue up a removed event.
157 if (indexable == null) {
158 IndexerRemovedReceipt r;
159 r = new IndexerRemovedReceipt (uri);
160 receipt_queue.Add (r);
164 if (HaveItemCount)
165 AdjustItemCount (-delete_count);
166 else
167 SetItemCount (primary_reader);
169 // If we have are doing any property changes,
170 // we read in the current secondary documents
171 // and store them in a hash table for use
172 // later. Then we delete the current
173 // secondary documents.
174 Hashtable current_docs = null;
175 if (prop_change_query != null) {
176 current_docs = UriFu.NewHashtable ();
178 LNS.IndexSearcher secondary_searcher;
179 secondary_searcher = new LNS.IndexSearcher (secondary_reader);
181 LNS.Hits hits;
182 hits = secondary_searcher.Search (prop_change_query);
184 ArrayList delete_terms;
185 delete_terms = new ArrayList ();
187 int N;
188 N = hits.Length ();
189 for (int i = 0; i < N; ++i) {
190 Document doc;
191 doc = hits.Doc (i);
193 Uri doc_uri;
194 doc_uri = GetUriFromDocument (doc);
196 current_docs [doc_uri] = doc;
198 Term term;
199 term = new Term ("Uri", UriFu.UriToSerializableString (doc_uri));
200 delete_terms.Add (term);
203 secondary_searcher.Close ();
205 foreach (Term term in delete_terms)
206 secondary_reader.Delete (term);
209 // FIXME: Would we gain more "transactionality" if we didn't close
210 // the readers until later? Would that even be possible, or will
211 // it create locking problems?
212 primary_reader.Close ();
213 secondary_reader.Close ();
216 // Step #2: Write out the pending adds.
218 if (text_cache != null)
219 text_cache.BeginTransaction ();
221 IndexWriter primary_writer, secondary_writer;
222 primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
223 secondary_writer = null;
225 foreach (Indexable indexable in pending_by_uri.Values) {
227 if (indexable == null)
228 continue;
230 IndexerAddedReceipt r;
231 r = new IndexerAddedReceipt (indexable.Uri);
232 r.Properties = indexable.Properties;
234 // Handle property changes
235 if (indexable.PropertyChangesOnly) {
236 Logger.Log.Debug ("+{0} (props only)", indexable.DisplayUri);
238 Document current_doc;
239 current_doc = current_docs [indexable.Uri] as Document;
241 Document new_doc;
242 new_doc = RewriteDocument (current_doc, indexable);
244 // Write out the new document...
245 if (secondary_writer == null)
246 secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
247 secondary_writer.AddDocument (new_doc);
249 r.PropertyChangesOnly = true;
250 receipt_queue.Add (r);
252 continue; // ...and proceed to the next Indexable
255 Logger.Log.Debug ("+{0}", indexable.DisplayUri);
257 Filter filter = null;
259 try {
260 FilterFactory.FilterIndexable (indexable, text_cache, out filter);
261 } catch (Exception e) {
262 Logger.Log.Error ("Unable to filter {0} (mimetype={1})", indexable.DisplayUri, indexable.MimeType);
263 Logger.Log.Error (e);
264 indexable.NoContent = true;
267 Document primary_doc = null, secondary_doc = null;
269 try {
270 BuildDocuments (indexable, out primary_doc, out secondary_doc);
271 primary_writer.AddDocument (primary_doc);
272 } catch (Exception ex) {
274 // If an exception was thrown, something bad probably happened
275 // while we were filtering the content. Set NoContent to true
276 // and try again.
278 Logger.Log.Debug ("First attempt to index {0} failed", indexable.DisplayUri);
279 Logger.Log.Debug (ex);
281 indexable.NoContent = true;
283 try {
284 BuildDocuments (indexable, out primary_doc, out secondary_doc);
285 primary_writer.AddDocument (primary_doc);
286 } catch (Exception ex2) {
287 Logger.Log.Debug ("Second attempt to index {0} failed, giving up...", indexable.DisplayUri);
288 Logger.Log.Debug (ex2);
292 if (filter != null) {
293 r.FilterName = filter.GetType ().ToString ();
294 r.FilterVersion = filter.Version;
297 receipt_queue.Add (r);
299 if (secondary_doc != null) {
300 if (secondary_writer == null)
301 secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
303 secondary_writer.AddDocument (secondary_doc);
306 AdjustItemCount (1);
309 if (text_cache != null)
310 text_cache.CommitTransaction ();
312 if (optimize_during_next_flush) {
313 Logger.Log.Debug ("Optimizing");
314 primary_writer.Optimize ();
315 if (secondary_writer == null)
316 secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
317 secondary_writer.Optimize ();
318 optimize_during_next_flush = false;
321 // Step #3. Close our writers and return the events to
322 // indicate what has happened.
324 primary_writer.Close ();
325 if (secondary_writer != null)
326 secondary_writer.Close ();
328 pending_by_uri.Clear ();
330 IndexerReceipt [] receipt_array;
331 receipt_array = new IndexerReceipt [receipt_queue.Count];
332 for (int i = 0; i < receipt_queue.Count; ++i)
333 receipt_array [i] = (IndexerReceipt) receipt_queue [i];
335 return receipt_array;
339 public void Flush ()
341 // FIXME: Right now we don't support a non-blocking flush,
342 // but it would be easy enough to do it in a thread.
344 IndexerReceipt [] receipts;
346 receipts = FlushAndBlock ();
348 if (FlushEvent != null) {
349 if (receipts != null)
350 FlushEvent (this, receipts); // this returns the receipts to anyone who cares
351 FlushEvent (this, null); // and this indicates that we are all done
356 public event IIndexerFlushHandler FlushEvent;
358 ////////////////////////////////////////////////////////////////
360 public void OptimizeNow ()
362 IndexWriter writer;
364 writer = new IndexWriter (PrimaryStore, null, false);
365 writer.Optimize ();
366 writer.Close ();
368 if (SecondaryStore != null) {
369 writer = new IndexWriter (SecondaryStore, null, false);
370 writer.Optimize ();
371 writer.Close ();
375 public void Merge (LuceneCommon index_to_merge)
377 // FIXME: Error recovery
379 // Merge the primary index
380 IndexWriter primary_writer;
381 Lucene.Net.Store.Directory[] primary_store = {index_to_merge.PrimaryStore};
382 primary_writer = new IndexWriter (PrimaryStore, null, false);
384 primary_writer.AddIndexes (primary_store);
385 primary_writer.Close ();
387 // Merge the secondary index
388 IndexWriter secondary_writer;
389 Lucene.Net.Store.Directory[] secondary_store = {index_to_merge.SecondaryStore};
390 secondary_writer = new IndexWriter (SecondaryStore, null, false);
392 secondary_writer.AddIndexes (secondary_store);
393 secondary_writer.Close ();