//
// LuceneIndexingDriver.cs
//
// Copyright (C) 2004-2005 Novell, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//

// This should be the only piece of source code that knows anything
// about Lucene's internals.

using System;
using System.Collections;
using System.Diagnostics;
using System.Globalization;
using System.Threading;
using System.Xml.Serialization;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using LNS = Lucene.Net.Search;

using Beagle.Util;

namespace Beagle.Daemon {

	public class LuceneIndexingDriver : LuceneCommon, IIndexer {

		object flush_lock = new object ();

		public LuceneIndexingDriver (string index_name, int minor_version, bool build_usercache)
			: base (index_name, minor_version)
		{
			if (build_usercache)
				text_cache = TextCache.UserCache;
		}

		public LuceneIndexingDriver (string index_name, int minor_version)
			: this (index_name, minor_version, true) { }

		public LuceneIndexingDriver (string index_name, bool build_usercache)
			: this (index_name, 0, build_usercache) { }

		public LuceneIndexingDriver (string index_name)
			: this (index_name, 0, true) { }

		////////////////////////////////////////////////////////////////

		// We use this in the index helper so that we can report what's
		// going on if the helper spins the CPU.  The method will be
		// called with null parameters after filtering has finished.

		public delegate void FileFilterDelegate (Uri display_uri, Filter filter);
		public FileFilterDelegate FileFilterNotifier = null;
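
		// Illustrative only: a consumer (such as the index helper) could track the
		// file currently being filtered with an anonymous delegate, e.g.
		//
		//     driver.FileFilterNotifier = delegate (Uri uri, Filter f) {
		//             current_uri = uri;       // hypothetical fields owned by the
		//             current_filter = f;      // caller, not by this class
		//     };
		//
		// so a watchdog can report what the helper was doing if it spins the CPU.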

		////////////////////////////////////////////////////////////////

		// Implementation of the IIndexer interface

		public IndexerReceipt [] Flush (IndexerRequest request)
		{
			// This is just to keep a big block of code from being
			// indented an extra eight spaces.
			lock (flush_lock)
				return Flush_Unlocked (request);
		}
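
		// Illustrative only: a caller hands Flush () a batch of work and then
		// inspects the receipts it gets back, e.g.
		//
		//     IndexerReceipt [] receipts = driver.Flush (request);
		//     foreach (IndexerReceipt receipt in receipts)
		//             if (receipt is IndexerAddedReceipt)
		//                     ... // update the caller's bookkeeping for the added item
		//
		// ('driver' and 'request' are hypothetical locals belonging to the caller.)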

		private IndexerReceipt [] Flush_Unlocked (IndexerRequest request)
		{
			ArrayList receipt_queue;
			receipt_queue = new ArrayList ();

			IndexReader primary_reader, secondary_reader;
			primary_reader = IndexReader.Open (PrimaryStore);
			secondary_reader = IndexReader.Open (SecondaryStore);

			// Step #1: Make our first pass over the list of
			// indexables that make up our request.  For each add
			// or remove in the request, delete the associated
			// items from the index.  Assemble a query that will
			// be used to find the secondary documents for any
			// property change requests.
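			// (For illustration: if the request carries property changes for
			// file:///a and file:///b, the assembled query ends up roughly as
			// "Uri:file:///a OR Uri:file:///b", with a parallel ParentUri query
			// for their children.  The URIs are hypothetical examples.)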

			LNS.BooleanQuery prop_change_query = null;
			LNS.BooleanQuery prop_change_children_query = null;
			int delete_count = 0;

			ICollection request_indexables = request.Indexables;

			foreach (Indexable indexable in request_indexables) {

				switch (indexable.Type) {

				case IndexableType.Add:
				case IndexableType.Remove:

					string uri_str;
					uri_str = UriFu.UriToEscapedString (indexable.Uri);

					Logger.Log.Debug ("-{0}", indexable.DisplayUri);

					Term term;
					term = new Term ("Uri", uri_str);
					delete_count += primary_reader.Delete (term);
					if (secondary_reader != null)
						secondary_reader.Delete (term);

					// When we delete an indexable, also delete any children.
					// FIXME: Shouldn't we also delete any children of children, etc.?
					term = new Term ("ParentUri", uri_str);
					delete_count += primary_reader.Delete (term);
					if (secondary_reader != null)
						secondary_reader.Delete (term);

					// If this is a strict removal (and not a deletion that
					// we are doing in anticipation of adding something back),
					// queue up a removed receipt.
					if (indexable.Type == IndexableType.Remove) {
						IndexerRemovedReceipt r;
						r = new IndexerRemovedReceipt (indexable.Uri);
						receipt_queue.Add (r);
					}

					break;

				case IndexableType.PropertyChange:

					if (prop_change_query == null) {
						prop_change_query = new LNS.BooleanQuery ();
						prop_change_children_query = new LNS.BooleanQuery ();
					}

					prop_change_query.Add (UriQuery ("Uri", indexable.Uri), false, false);
					prop_change_children_query.Add (UriQuery ("ParentUri", indexable.Uri), false, false);

					break;
				}
			}

			if (HaveItemCount)
				AdjustItemCount (-delete_count);
			else
				SetItemCount (primary_reader);

			// Step #2: If we are doing any property changes,
			// we read in the current secondary documents and
			// store them in a hash table for use later.  Then we
			// delete the current secondary documents.

			Hashtable prop_change_docs = null;
			Hashtable prop_change_children_docs = null;

			if (prop_change_query != null) {
				prop_change_docs = UriFu.NewHashtable ();

				LNS.IndexSearcher secondary_searcher;
				secondary_searcher = new LNS.IndexSearcher (secondary_reader);

				LNS.Hits hits;
				hits = secondary_searcher.Search (prop_change_query);

				ArrayList delete_terms;
				delete_terms = new ArrayList ();

				int N = hits.Length ();

				for (int i = 0; i < N; ++i) {
					Document doc;
					doc = hits.Doc (i);

					string uri_str;
					uri_str = doc.Get ("Uri");

					Uri uri;
					uri = UriFu.EscapedStringToUri (uri_str);
					prop_change_docs [uri] = doc;

					Term term;
					term = new Term ("Uri", uri_str);
					delete_terms.Add (term);
				}

				secondary_searcher.Close ();

				foreach (Term term in delete_terms)
					secondary_reader.Delete (term);

				// Step #2.5: Find all child indexables for this document.
				// Store them to send them later as IndexerChildIndexablesReceipts.
				prop_change_children_docs = UriFu.NewHashtable ();

				hits = secondary_searcher.Search (prop_change_children_query);
				N = hits.Length ();

				for (int i = 0; i < N; ++i) {
					Document doc;
					doc = hits.Doc (i);

					string uri_str, parent_uri_str;
					uri_str = doc.Get ("Uri");
					parent_uri_str = doc.Get ("ParentUri");

					Uri uri, parent_uri;
					uri = UriFu.EscapedStringToUri (uri_str);
					parent_uri = UriFu.EscapedStringToUri (parent_uri_str);

					if (! prop_change_children_docs.Contains (parent_uri)) {
						ArrayList c_list = new ArrayList ();
						prop_change_children_docs [parent_uri] = c_list;
					}

					ArrayList children_list = (ArrayList) prop_change_children_docs [parent_uri];
					children_list.Add (uri);
				}

				secondary_searcher.Close ();
			}

			// We are now done with the readers, so we close them.
			primary_reader.Close ();
			secondary_reader.Close ();

			// FIXME: If we crash at exactly this point, we are in
			// trouble.  Items will have been dropped from the index
			// without the proper replacements being added.

			// Step #3: Make another pass across our list of indexables
			// and write out any new documents.

			if (text_cache != null)
				text_cache.BeginTransaction ();

			IndexWriter primary_writer, secondary_writer;
			primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
			secondary_writer = null;

			foreach (Indexable indexable in request_indexables) {

				// Removals were fully handled in Step #1.
				if (indexable.Type == IndexableType.Remove)
					continue;

				IndexerAddedReceipt r;
				r = new IndexerAddedReceipt (indexable.Uri);
				receipt_queue.Add (r);

				if (indexable.Type == IndexableType.PropertyChange) {

					Logger.Log.Debug ("+{0} (props only)", indexable.DisplayUri);
					r.PropertyChangesOnly = true;

					Document doc;
					doc = prop_change_docs [indexable.Uri] as Document;

					Document new_doc;
					new_doc = RewriteDocument (doc, indexable);

					// Write out the new document...
					if (secondary_writer == null)
						secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
					secondary_writer.AddDocument (new_doc);

					// Add children property change indexables...
					AddChildrenPropertyChange (prop_change_children_docs, indexable, receipt_queue);

					continue; // ...and proceed to the next Indexable
				}

				// If we reach this point we know we are dealing with an IndexableType.Add

				if (indexable.Type != IndexableType.Add)
					throw new Exception ("When I said it was an IndexableType.Add, I meant it!");

				Logger.Log.Debug ("+{0}", indexable.DisplayUri);

				Filter filter = null;

				if (FileFilterNotifier != null)
					FileFilterNotifier (indexable.DisplayUri, null); // We don't know what filter yet.

				// If we have content, try to find a filter
				// which we can use to process the indexable.
				try {
					FilterFactory.FilterIndexable (indexable, text_cache, out filter);
				} catch (Exception e) {
					Logger.Log.Error (e, "Unable to filter {0} (mimetype={1})", indexable.DisplayUri, indexable.MimeType);
					indexable.NoContent = true;
				}

				if (FileFilterNotifier != null)
					FileFilterNotifier (indexable.DisplayUri, filter); // Update with our filter

				Document primary_doc = null, secondary_doc = null;

				try {
					BuildDocuments (indexable, out primary_doc, out secondary_doc);
					primary_writer.AddDocument (primary_doc);
				} catch (Exception ex) {

					// If an exception was thrown, something bad probably happened
					// while we were filtering the content.  Set NoContent to true
					// and try again -- that way it will at least end up in the index,
					// even if we don't manage to extract the fulltext.

					Logger.Log.Debug (ex, "First attempt to index {0} failed", indexable.DisplayUri);

					indexable.NoContent = true;

					try {
						BuildDocuments (indexable, out primary_doc, out secondary_doc);
						primary_writer.AddDocument (primary_doc);
					} catch (Exception ex2) {
						Logger.Log.Debug (ex2, "Second attempt to index {0} failed, giving up...", indexable.DisplayUri);
					}
				}

				if (filter != null) {

					// Force the clean-up of temporary files, just in case.
					filter.Cleanup ();

					r.FilterName = filter.GetType ().ToString ();
					r.FilterVersion = filter.Version;

					// Create a receipt containing any child indexables.
					if (filter.ChildIndexables.Count > 0) {
						Log.Debug ("Generated {0} child indexable{1} from {2} (filtered with {3})", filter.ChildIndexables.Count, filter.ChildIndexables.Count > 1 ? "s" : "", indexable.DisplayUri, r.FilterName);

						IndexerChildIndexablesReceipt cr;
						cr = new IndexerChildIndexablesReceipt (indexable, filter.ChildIndexables);
						receipt_queue.Add (cr);
					}
				}

				if (FileFilterNotifier != null)
					FileFilterNotifier (null, null); // reset

				if (secondary_doc != null) {
					if (secondary_writer == null)
						secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);

					secondary_writer.AddDocument (secondary_doc);
				}

				// Clean up any temporary files associated with filtering this indexable.
				indexable.Cleanup ();
			}

			if (text_cache != null)
				text_cache.CommitTransaction ();

			if (request.OptimizeIndex) {
				Stopwatch watch = new Stopwatch ();
				Logger.Log.Debug ("Optimizing {0}", IndexName);
				watch.Start ();

				primary_writer.Optimize ();
				if (secondary_writer == null)
					secondary_writer = new IndexWriter (SecondaryStore, IndexingAnalyzer, false);
				secondary_writer.Optimize ();

				watch.Stop ();
				Logger.Log.Debug ("{0} optimized in {1}", IndexName, watch);
			}

			// Step #4. Close our writers and return the events to
			// indicate what has happened.

			primary_writer.Close ();
			if (secondary_writer != null)
				secondary_writer.Close ();

			IndexerReceipt [] receipt_array;
			receipt_array = new IndexerReceipt [receipt_queue.Count];
			for (int i = 0; i < receipt_queue.Count; ++i)
				receipt_array [i] = (IndexerReceipt) receipt_queue [i];

			return receipt_array;
		}

		// Since some parent properties may be stored in the children as
		// "parent:" properties, any property change on a parent should be
		// propagated to all of its children as well.
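		// (For illustration: if the secondary index lists file:///dir/doc#child1
		// and file:///dir/doc#child2 as children of file:///dir/doc, a property
		// change on file:///dir/doc yields one PropertyChange child indexable for
		// each of them.  These URIs are hypothetical examples.)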
		private void AddChildrenPropertyChange (Hashtable children_docs,
							Indexable parent,
							ArrayList receipt_queue)
		{
			if (! children_docs.Contains (parent.Uri))
				return;

			ArrayList children_list = (ArrayList) children_docs [parent.Uri];

			IndexerChildIndexablesReceipt child_r;
			child_r = new IndexerChildIndexablesReceipt ();
			ArrayList child_indexable_list = new ArrayList ();

			foreach (Uri uri in children_list) {
				Indexable child_indexable;
				child_indexable = new Indexable (IndexableType.PropertyChange, uri);
				Log.Debug ("Creating property change child indexable for {1} (parent {0})", parent.Uri, uri);

				child_indexable.SetChildOf (parent);
				child_indexable_list.Add (child_indexable);
			}

			child_r.Children = child_indexable_list;
			receipt_queue.Add (child_r);
		}

		////////////////////////////////////////////////////////////////

		public void OptimizeNow ()
		{
			IndexWriter writer;

			writer = new IndexWriter (PrimaryStore, null, false);
			writer.Optimize ();
			writer.Close ();

			if (SecondaryStore != null) {
				writer = new IndexWriter (SecondaryStore, null, false);
				writer.Optimize ();
				writer.Close ();
			}
		}

		public void Merge (LuceneCommon index_to_merge)
		{
			// FIXME: Error recovery

			// Merge the primary index
			IndexWriter primary_writer;
			Lucene.Net.Store.Directory [] primary_store = { index_to_merge.PrimaryStore };
			primary_writer = new IndexWriter (PrimaryStore, null, false);

			primary_writer.AddIndexes (primary_store);
			primary_writer.Close ();

			// Merge the secondary index
			IndexWriter secondary_writer;
			Lucene.Net.Store.Directory [] secondary_store = { index_to_merge.SecondaryStore };
			secondary_writer = new IndexWriter (SecondaryStore, null, false);

			secondary_writer.AddIndexes (secondary_store);
			secondary_writer.Close ();
		}
	}
}