2 // LuceneIndexingDriver.cs
4 // Copyright (C) 2004-2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This should be the only piece of source code that knows anything
29 // about Lucene's internals.
33 using System
.Collections
;
34 using System
.Diagnostics
;
35 using System
.Globalization
;
38 using System
.Threading
;
40 using System
.Xml
.Serialization
;
42 using Lucene
.Net
.Analysis
;
43 using Lucene
.Net
.Analysis
.Standard
;
44 using Lucene
.Net
.Documents
;
45 using Lucene
.Net
.Index
;
46 using Lucene
.Net
.QueryParsers
;
47 using LNS
= Lucene
.Net
.Search
;
51 namespace Beagle
.Daemon
{
53 public class LuceneIndexingDriver
: LuceneCommon
, IIndexer
{
// Dedicated monitor object for flush operations.  It is private and
// readonly so that the object being locked on can never be swapped out
// from under a thread that is waiting on it.
// NOTE(review): the lock statement that uses this field is not visible
// in this view of the file.
private readonly object flush_lock = new object ();
// Main constructor: forwards index_name and minor_version to the
// LuceneCommon base constructor.
// NOTE(review): the lines between the base call and the text_cache
// assignment are not visible in this view; presumably build_usercache
// gates the assignment below -- confirm against the full file.
57 public LuceneIndexingDriver (string index_name
, int minor_version
, bool build_usercache
)
58 : base (index_name
, minor_version
)
// Point text_cache at TextCache.UserCache.
66 text_cache
= TextCache
.UserCache
;
// Convenience overload: behaves like the three-argument constructor
// with build_usercache fixed to true.
public LuceneIndexingDriver (string index_name, int minor_version)
	: this (index_name, minor_version, true)
{
}
// Convenience overload: behaves like the three-argument constructor
// with minor_version fixed to 0.
public LuceneIndexingDriver (string index_name, bool build_usercache)
	: this (index_name, 0, build_usercache)
{
}
// Convenience overload: behaves like the three-argument constructor
// with minor_version fixed to 0 and build_usercache fixed to true.
public LuceneIndexingDriver (string index_name)
	: this (index_name, 0, true)
{
}
78 ////////////////////////////////////////////////////////////////
81 // Implementation of the IIndexer interface
// IIndexer.Flush entry point: hands the request to Flush_Unlocked and
// returns its receipts unchanged.
// NOTE(review): the class declares flush_lock, but no lock statement is
// visible in this view; presumably this call runs under it -- confirm.
84 public IndexerReceipt
[] Flush (IndexerRequest request
)
86 // This is just to keep a big block of code from being
87 // indented an extra eight spaces.
89 return Flush_Unlocked (request
);
// Performs the actual work of a flush for the given request and returns
// the receipts describing what happened.  The work is done in the steps
// marked by the comments below:
//   1. delete the index entries for every Add/Remove indexable and
//      collect queries for the PropertyChange indexables;
//   2. for property changes, remember and then delete the current
//      secondary documents (2.5: collect the child URIs per parent);
//   3. write the new documents through the index writers;
//   4. close the writers and return the queued receipts as an array.
// NOTE(review): this view of the function has gaps (some declarations,
// braces and statements are not visible), so the comments added here
// only describe statements that can actually be seen.
92 private IndexerReceipt
[] Flush_Unlocked (IndexerRequest request
)
94 ArrayList receipt_queue
;
95 receipt_queue
= new ArrayList ();
97 IndexReader primary_reader
, secondary_reader
;
98 primary_reader
= IndexReader
.Open (PrimaryStore
);
99 secondary_reader
= IndexReader
.Open (SecondaryStore
);
101 // Step #1: Make our first pass over the list of
102 // indexables that make up our request. For each add
103 // or remove in the request, delete the associated
104 // items from the index. Assemble a query that will
105 // be used to find the secondary documents for any
106 // property change requests.
108 LNS
.BooleanQuery prop_change_query
= null;
109 LNS
.BooleanQuery prop_change_children_query
= null;
110 int delete_count
= 0;
112 ICollection request_indexables
= request
.Indexables
;
114 foreach (Indexable indexable
in request_indexables
) {
116 switch (indexable
.Type
) {
118 case IndexableType
.Add
:
119 case IndexableType
.Remove
:
122 uri_str
= UriFu
.UriToEscapedString (indexable
.Uri
);
124 Logger
.Log
.Debug ("-{0}", indexable
.DisplayUri
);
// Only deletions from the primary index are added to delete_count.
127 term
= new Term ("Uri", uri_str
);
128 delete_count
+= primary_reader
.Delete (term
);
129 if (secondary_reader
!= null)
130 secondary_reader
.Delete (term
);
132 // When we delete an indexable, also delete any children.
133 // FIXME: Shouldn't we also delete any children of children, etc.?
134 term
= new Term ("ParentUri", uri_str
);
135 delete_count
+= primary_reader
.Delete (term
);
136 if (secondary_reader
!= null)
137 secondary_reader
.Delete (term
);
139 // If this is a strict removal (and not a deletion that
140 // we are doing in anticipation of adding something back),
141 // queue up a removed receipt.
142 if (indexable
.Type
== IndexableType
.Remove
) {
143 IndexerRemovedReceipt r
;
144 r
= new IndexerRemovedReceipt (indexable
.Uri
);
145 receipt_queue
.Add (r
);
// Property changes: both queries are created lazily on the first
// PropertyChange indexable, then get one clause per indexable.
150 case IndexableType
.PropertyChange
:
151 if (prop_change_query
== null) {
152 prop_change_query
= new LNS
.BooleanQuery ();
153 prop_change_children_query
= new LNS
.BooleanQuery ();
156 prop_change_query
.Add (UriQuery ("Uri", indexable
.Uri
), false, false);
157 prop_change_children_query
.Add (UriQuery ("ParentUri", indexable
.Uri
), false, false);
// NOTE(review): both item-count updates appear back to back here; any
// condition choosing between them is not visible in this view.
163 AdjustItemCount (-delete_count
);
165 SetItemCount (primary_reader
);
167 // Step #2: If we are doing any property changes,
168 // we read in the current secondary documents and
169 // store them in a hash table for use later. Then we
170 // delete the current secondary documents.
171 Hashtable prop_change_docs
= null;
172 Hashtable prop_change_children_docs
= null;
173 if (prop_change_query
!= null) {
174 prop_change_docs
= UriFu
.NewHashtable ();
176 LNS
.IndexSearcher secondary_searcher
;
177 secondary_searcher
= new LNS
.IndexSearcher (secondary_reader
);
180 hits
= secondary_searcher
.Search (prop_change_query
);
182 ArrayList delete_terms
;
183 delete_terms
= new ArrayList ();
185 int N
= hits
.Length ();
187 for (int i
= 0; i
< N
; ++i
) {
191 uri_str
= doc
.Get ("Uri");
194 uri
= UriFu
.EscapedStringToUri (uri_str
);
195 prop_change_docs
[uri
] = doc
;
198 term
= new Term ("Uri", uri_str
);
199 delete_terms
.Add (term
);
202 secondary_searcher
.Close ();
204 foreach (Term term
in delete_terms
)
205 secondary_reader
.Delete (term
);
207 // Step #2.5: Find all child indexables for this document
208 // Store them to send them later as IndexerChildIndexablesReceipts
209 prop_change_children_docs
= UriFu
.NewHashtable ();
// NOTE(review): secondary_searcher.Close () was already called above,
// yet the searcher is used again here -- confirm this is intended/safe.
211 hits
= secondary_searcher
.Search (prop_change_children_query
);
// NOTE(review): N was computed from the previous hits; no reassignment
// is visible before this loop in this view -- confirm.
214 for (int i
= 0; i
< N
; ++i
) {
217 string uri_str
, parent_uri_str
;
218 uri_str
= doc
.Get ("Uri");
219 parent_uri_str
= doc
.Get ("ParentUri");
222 uri
= UriFu
.EscapedStringToUri (uri_str
);
223 parent_uri
= UriFu
.EscapedStringToUri (parent_uri_str
);
// Group the child URIs by parent URI, creating the per-parent list on
// first use.
225 if (! prop_change_children_docs
.Contains (parent_uri
)) {
226 ArrayList c_list
= new ArrayList ();
227 prop_change_children_docs
[parent_uri
] = c_list
;
230 ArrayList children_list
= (ArrayList
) prop_change_children_docs
[parent_uri
];
231 children_list
.Add (uri
);
234 secondary_searcher
.Close ();
238 // We are now done with the readers, so we close them.
239 primary_reader
.Close ();
240 secondary_reader
.Close ();
242 // FIXME: If we crash at exactly this point, we are in
243 // trouble. Items will have been dropped from the index
244 // without the proper replacements being added.
246 // Step #3: Make another pass across our list of indexables
247 // and write out any new documents.
249 if (text_cache
!= null)
250 text_cache
.BeginTransaction ();
// The secondary writer is created lazily, only once something has to be
// written to (or optimized in) the secondary store.
252 IndexWriter primary_writer
, secondary_writer
;
253 primary_writer
= new IndexWriter (PrimaryStore
, IndexingAnalyzer
, false);
254 secondary_writer
= null;
256 foreach (Indexable indexable
in request_indexables
) {
258 if (indexable
.Type
== IndexableType
.Remove
)
261 IndexerAddedReceipt r
;
262 r
= new IndexerAddedReceipt (indexable
.Uri
);
263 receipt_queue
.Add (r
);
265 if (indexable
.Type
== IndexableType
.PropertyChange
) {
267 Logger
.Log
.Debug ("+{0} (props only)", indexable
.DisplayUri
);
268 r
.PropertyChangesOnly
= true;
// NOTE(review): "as Document" yields null when no document was stored
// for this Uri in step #2; no null check is visible in this view.
271 doc
= prop_change_docs
[indexable
.Uri
] as Document
;
274 new_doc
= RewriteDocument (doc
, indexable
);
276 // Write out the new document...
277 if (secondary_writer
== null)
278 secondary_writer
= new IndexWriter (SecondaryStore
, IndexingAnalyzer
, false);
279 secondary_writer
.AddDocument (new_doc
);
281 // Add children property change indexables...
282 AddChildrenPropertyChange (
283 prop_change_children_docs
,
287 continue; // ...and proceed to the next Indexable
290 // If we reach this point we know we are dealing with an IndexableType.Add
292 if (indexable
.Type
!= IndexableType
.Add
)
293 throw new Exception ("When I said it was an IndexableType.Add, I meant it!");
295 Logger
.Log
.Debug ("+{0}", indexable
.DisplayUri
);
297 Filter filter
= null;
299 // If we have content, try to find a filter
300 // which we can use to process the indexable.
302 FilterFactory
.FilterIndexable (indexable
, text_cache
, out filter
);
303 } catch (Exception e
) {
304 Logger
.Log
.Error (e
, "Unable to filter {0} (mimetype={1})", indexable
.DisplayUri
, indexable
.MimeType
);
305 indexable
.NoContent
= true;
308 Document primary_doc
= null, secondary_doc
= null;
311 BuildDocuments (indexable
, out primary_doc
, out secondary_doc
);
312 primary_writer
.AddDocument (primary_doc
);
313 } catch (Exception ex
) {
315 // If an exception was thrown, something bad probably happened
316 // while we were filtering the content. Set NoContent to true
317 // and try again -- that way it will at least end up in the index,
318 // even if we don't manage to extract the fulltext.
320 Logger
.Log
.Debug (ex
, "First attempt to index {0} failed", indexable
.DisplayUri
);
322 indexable
.NoContent
= true;
325 BuildDocuments (indexable
, out primary_doc
, out secondary_doc
);
326 primary_writer
.AddDocument (primary_doc
);
327 } catch (Exception ex2
) {
328 Logger
.Log
.Debug (ex2
, "Second attempt to index {0} failed, giving up...", indexable
.DisplayUri
);
332 if (filter
!= null) {
334 // Force the clean-up of temporary files, just in case.
// Record which filter (type name and version) handled this indexable.
337 r
.FilterName
= filter
.GetType ().ToString ();
338 r
.FilterVersion
= filter
.Version
;
340 // Create a receipt containing any child indexables.
341 if (filter
.ChildIndexables
.Count
> 0) {
342 Log
.Debug ("Generated {0} child indexable{1} from {2} (filtered with {3})", filter
.ChildIndexables
.Count
, filter
.ChildIndexables
.Count
> 1 ? "s" : "", indexable
.DisplayUri
, r
.FilterName
);
343 IndexerChildIndexablesReceipt cr
;
344 cr
= new IndexerChildIndexablesReceipt (indexable
, filter
.ChildIndexables
);
345 receipt_queue
.Add (cr
);
349 if (secondary_doc
!= null) {
350 if (secondary_writer
== null)
351 secondary_writer
= new IndexWriter (SecondaryStore
, IndexingAnalyzer
, false);
353 secondary_writer
.AddDocument (secondary_doc
);
358 // Clean up any temporary files associated with filtering this indexable.
359 indexable
.Cleanup ();
362 if (text_cache
!= null)
363 text_cache
.CommitTransaction ();
// Optional optimization pass over both indexes, requested per flush via
// request.OptimizeIndex.
365 if (request
.OptimizeIndex
) {
366 Stopwatch watch
= new Stopwatch ();
367 Logger
.Log
.Debug ("Optimizing {0}", IndexName
);
369 primary_writer
.Optimize ();
370 if (secondary_writer
== null)
371 secondary_writer
= new IndexWriter (SecondaryStore
, IndexingAnalyzer
, false);
372 secondary_writer
.Optimize ();
374 Logger
.Log
.Debug ("{0} optimized in {1}", IndexName
, watch
);
377 // Step #4. Close our writers and return the events to
378 // indicate what has happened.
380 primary_writer
.Close ();
381 if (secondary_writer
!= null)
382 secondary_writer
.Close ();
// Copy the queued receipts into a typed array for the caller.
384 IndexerReceipt
[] receipt_array
;
385 receipt_array
= new IndexerReceipt
[receipt_queue
.Count
];
386 for (int i
= 0; i
< receipt_queue
.Count
; ++i
)
387 receipt_array
[i
] = (IndexerReceipt
) receipt_queue
[i
];
389 return receipt_array
;
392 // Since some parent properties may be stored in child properties
393 // as parent: property, any property change should be propagated
394 // to all its children as well.
//
// Looks up parent.Uri in children_docs, builds one PropertyChange
// Indexable per child Uri found there (each marked as a child of
// parent via SetChildOf), and queues them on receipt_queue wrapped in a
// single IndexerChildIndexablesReceipt.
// NOTE(review): the declaration of the "parent" parameter used below,
// and the statement taken when children_docs has no entry for
// parent.Uri, are not visible in this view.
395 private void AddChildrenPropertyChange (
396 Hashtable children_docs
,
398 ArrayList receipt_queue
)
400 if (! children_docs
.Contains (parent
.Uri
))
403 ArrayList children_list
= (ArrayList
) children_docs
[parent
.Uri
];
404 IndexerChildIndexablesReceipt child_r
;
405 child_r
= new IndexerChildIndexablesReceipt ();
406 ArrayList child_indexable_list
= new ArrayList ();
408 foreach (Uri uri
in children_list
) {
409 Indexable child_indexable
;
410 child_indexable
= new Indexable (IndexableType
.PropertyChange
, uri
);
// Note the placeholder order: {1} is the child Uri, {0} the parent Uri.
411 Log
.Debug ("Creating property change child indexable for {1} (parent {0})", parent
.Uri
, uri
);
413 child_indexable
.SetChildOf (parent
);
414 child_indexable_list
.Add (child_indexable
);
417 child_r
.Children
= child_indexable_list
;
418 receipt_queue
.Add (child_r
);
421 ////////////////////////////////////////////////////////////////
// Opens an IndexWriter on PrimaryStore and, when SecondaryStore is not
// null, another one on SecondaryStore, passing null and false as the
// remaining constructor arguments in both cases.
// NOTE(review): the Optimize/Close calls on these writers are not
// visible in this view; presumably each writer is optimized and closed
// right after it is opened -- confirm against the full file.
423 public void OptimizeNow ()
427 writer
= new IndexWriter (PrimaryStore
, null, false);
431 if (SecondaryStore
!= null) {
432 writer
= new IndexWriter (SecondaryStore
, null, false);
// Merges the primary and the secondary index of index_to_merge into the
// corresponding stores of this driver: first the primary index, then
// the secondary one.
//
// Each IndexWriter is closed in a finally block, so a failure inside
// AddIndexes no longer leaves the writer open.  Exceptions still
// propagate to the caller unchanged.
public void Merge (LuceneCommon index_to_merge)
{
	// FIXME: Error recovery

	// Merge the primary index
	Lucene.Net.Store.Directory[] primary_store = {index_to_merge.PrimaryStore};
	IndexWriter primary_writer = new IndexWriter (PrimaryStore, null, false);
	try {
		primary_writer.AddIndexes (primary_store);
	} finally {
		primary_writer.Close ();
	}

	// Merge the secondary index
	Lucene.Net.Store.Directory[] secondary_store = {index_to_merge.SecondaryStore};
	IndexWriter secondary_writer = new IndexWriter (SecondaryStore, null, false);
	try {
		secondary_writer.AddIndexes (secondary_store);
	} finally {
		secondary_writer.Close ();
	}
}