2 // LuceneIndexingDriver.cs
4 // Copyright (C) 2004-2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This should be the only piece of source code that knows anything
29 // about Lucene's internals.
33 using System
.Collections
;
34 using System
.Diagnostics
;
35 using System
.Globalization
;
38 using System
.Threading
;
40 using System
.Xml
.Serialization
;
42 using Lucene
.Net
.Analysis
;
43 using Lucene
.Net
.Analysis
.Standard
;
44 using Lucene
.Net
.Documents
;
45 using Lucene
.Net
.Index
;
46 using Lucene
.Net
.QueryParsers
;
47 using LNS
= Lucene
.Net
.Search
;
51 namespace Beagle
.Daemon
{
53 public class LuceneIndexingDriver
: LuceneCommon
, IIndexer
{
// Private monitor object owned by this driver.  It is readonly so the
// reference can never be reassigned after construction; swapping the
// object would silently break mutual exclusion for anything locking on it.
private readonly object flush_lock = new object ();
// Main constructor; the shorter overloads all chain to this one.
// index_name and minor_version are forwarded unchanged to the
// LuceneCommon base constructor.  The visible body assigns
// TextCache.UserCache to text_cache.
// NOTE(review): presumably that assignment is conditional on
// build_usercache -- the guarding statement is not visible in this
// view; confirm.
57 public LuceneIndexingDriver (string index_name
, int minor_version
, bool build_usercache
)
58 : base (index_name
, minor_version
)
66 text_cache
= TextCache
.UserCache
;
// Convenience overload: same as the three-argument constructor with
// build_usercache fixed to true.
public LuceneIndexingDriver (string index_name, int minor_version)
	: this (index_name, minor_version, true)
{
}
// Convenience overload: same as the three-argument constructor with
// minor_version fixed to 0.
public LuceneIndexingDriver (string index_name, bool build_usercache)
	: this (index_name, 0, build_usercache)
{
}
// Convenience overload: same as the three-argument constructor with
// minor_version fixed to 0 and build_usercache fixed to true.
public LuceneIndexingDriver (string index_name)
	: this (index_name, 0, true)
{
}
78 ////////////////////////////////////////////////////////////////
81 // Implementation of the IIndexer interface
// Public entry point for flushing an IndexerRequest.  All of the work is
// delegated to Flush_Unlocked, whose receipts are returned unchanged.
// NOTE(review): presumably the call is made while holding flush_lock --
// the lock statement is not visible in this view; confirm.
84 public IndexerReceipt
[] Flush (IndexerRequest request
)
86 // This is just to keep a big block of code from being
87 // indented an extra eight spaces.
89 return Flush_Unlocked (request
);
// Performs the actual flush for a request.  From the statements visible
// here the work proceeds in the numbered steps commented below:
//   1. open readers on both stores and delete the existing documents for
//      every Add/Remove indexable (by "Uri" and by "ParentUri"), queueing
//      an IndexerRemovedReceipt for each Remove and collecting Uri queries
//      for each PropertyChange;
//   2. for property changes, stash the matching secondary documents (and
//      the Uris of their children) in hash tables, then delete them;
//   3. open writers on the stores and add the new documents, queueing an
//      IndexerAddedReceipt (and child-indexable receipts) as it goes;
//   4. optionally optimize, close the writers and return the queued
//      receipts as an array.
92 private IndexerReceipt
[] Flush_Unlocked (IndexerRequest request
)
94 ArrayList receipt_queue
;
95 receipt_queue
= new ArrayList ();
97 IndexReader primary_reader
, secondary_reader
;
98 primary_reader
= IndexReader
.Open (PrimaryStore
);
99 secondary_reader
= IndexReader
.Open (SecondaryStore
);
101 // Step #1: Make our first pass over the list of
102 // indexables that make up our request. For each add
103 // or remove in the request, delete the associated
104 // items from the index. Assemble a query that will
105 // be used to find the secondary documents for any
106 // property change requests.
108 LNS
.BooleanQuery prop_change_query
= null;
109 LNS
.BooleanQuery prop_change_children_query
= null;
110 int delete_count
= 0;
112 foreach (Indexable indexable
in request
.Indexables
) {
114 switch (indexable
.Type
) {
116 case IndexableType
.Add
:
117 case IndexableType
.Remove
:
120 uri_str
= UriFu
.UriToEscapedString (indexable
.Uri
);
122 Logger
.Log
.Debug ("-{0}", indexable
.DisplayUri
);
125 term
= new Term ("Uri", uri_str
);
// Only deletions from the primary index are counted in delete_count.
126 delete_count
+= primary_reader
.Delete (term
);
// NOTE(review): secondary_reader is null-checked here, yet it is handed
// to IndexSearcher and closed further down without a visible guard --
// confirm whether it can actually be null.
127 if (secondary_reader
!= null)
128 secondary_reader
.Delete (term
);
130 // When we delete an indexable, also delete any children.
131 // FIXME: Shouldn't we also delete any children of children, etc.?
132 term
= new Term ("ParentUri", uri_str
);
133 delete_count
+= primary_reader
.Delete (term
);
134 if (secondary_reader
!= null)
135 secondary_reader
.Delete (term
);
137 // If this is a strict removal (and not a deletion that
138 // we are doing in anticipation of adding something back),
139 // queue up a removed receipt.
140 if (indexable
.Type
== IndexableType
.Remove
) {
141 IndexerRemovedReceipt r
;
142 r
= new IndexerRemovedReceipt (indexable
.Uri
);
143 receipt_queue
.Add (r
);
148 case IndexableType
.PropertyChange
:
// Both queries are created lazily, on the first property change seen.
149 if (prop_change_query
== null) {
150 prop_change_query
= new LNS
.BooleanQuery ();
151 prop_change_children_query
= new LNS
.BooleanQuery ();
154 prop_change_query
.Add (UriQuery ("Uri", indexable
.Uri
), false, false);
155 prop_change_children_query
.Add (UriQuery ("ParentUri", indexable
.Uri
), false, false);
161 AdjustItemCount (-delete_count
);
163 SetItemCount (primary_reader
);
165 // Step #2: If we are doing any property changes,
166 // we read in the current secondary documents and
167 // store them in a hash table for use later. Then we
168 // delete the current secondary documents.
169 Hashtable prop_change_docs
= null;
170 Hashtable prop_change_children_docs
= null;
171 if (prop_change_query
!= null) {
172 prop_change_docs
= UriFu
.NewHashtable ();
174 LNS
.IndexSearcher secondary_searcher
;
175 secondary_searcher
= new LNS
.IndexSearcher (secondary_reader
);
178 hits
= secondary_searcher
.Search (prop_change_query
);
// Terms are collected first and deleted after the loop over the hits.
180 ArrayList delete_terms
;
181 delete_terms
= new ArrayList ();
183 int N
= hits
.Length ();
185 for (int i
= 0; i
< N
; ++i
) {
189 uri_str
= doc
.Get ("Uri");
192 uri
= UriFu
.EscapedStringToUri (uri_str
);
193 prop_change_docs
[uri
] = doc
;
196 term
= new Term ("Uri", uri_str
);
197 delete_terms
.Add (term
);
200 secondary_searcher
.Close ();
202 foreach (Term term
in delete_terms
)
203 secondary_reader
.Delete (term
);
205 // Step #2.5: Find all child indexables for this document
206 // Store them to send them later as IndexerChildIndexablesReceipts
207 prop_change_children_docs
= UriFu
.NewHashtable ();
// NOTE(review): secondary_searcher was already closed above and is
// searched again here (and closed a second time below), unless it is
// re-created in a statement not visible in this view -- confirm that
// Close () leaves a reader-backed searcher usable.
209 hits
= secondary_searcher
.Search (prop_change_children_query
);
// NOTE(review): N was computed from the first result set; confirm it is
// refreshed for this second search before the loop runs.
212 for (int i
= 0; i
< N
; ++i
) {
215 string uri_str
, parent_uri_str
;
216 uri_str
= doc
.Get ("Uri");
217 parent_uri_str
= doc
.Get ("ParentUri");
220 uri
= UriFu
.EscapedStringToUri (uri_str
);
221 parent_uri
= UriFu
.EscapedStringToUri (parent_uri_str
);
// Maps each parent Uri to an ArrayList of its child Uris.
223 if (! prop_change_children_docs
.Contains (parent_uri
)) {
224 ArrayList c_list
= new ArrayList ();
225 prop_change_children_docs
[parent_uri
] = c_list
;
228 ArrayList children_list
= (ArrayList
) prop_change_children_docs
[parent_uri
];
229 children_list
.Add (uri
);
232 secondary_searcher
.Close ();
236 // We are now done with the readers, so we close them.
237 primary_reader
.Close ();
238 secondary_reader
.Close ();
240 // FIXME: If we crash at exactly this point, we are in
241 // trouble. Items will have been dropped from the index
242 // without the proper replacements being added.
244 // Step #3: Make another pass across our list of indexables
245 // and write out any new documents.
247 if (text_cache
!= null)
248 text_cache
.BeginTransaction ();
// The secondary writer is created lazily, only when a secondary
// document has to be written or the index is optimized.
250 IndexWriter primary_writer
, secondary_writer
;
251 primary_writer
= new IndexWriter (PrimaryStore
, IndexingAnalyzer
, false);
252 secondary_writer
= null;
254 foreach (Indexable indexable
in request
.Indexables
) {
256 if (indexable
.Type
== IndexableType
.Remove
)
259 IndexerAddedReceipt r
;
260 r
= new IndexerAddedReceipt (indexable
.Uri
);
261 receipt_queue
.Add (r
);
263 if (indexable
.Type
== IndexableType
.PropertyChange
) {
265 Logger
.Log
.Debug ("+{0} (props only)", indexable
.DisplayUri
);
266 r
.PropertyChangesOnly
= true;
// The "as" cast yields null when no secondary document was stashed for
// this Uri in step #2.
// NOTE(review): confirm RewriteDocument copes with a null document.
269 doc
= prop_change_docs
[indexable
.Uri
] as Document
;
272 new_doc
= RewriteDocument (doc
, indexable
);
274 // Write out the new document...
275 if (secondary_writer
== null)
276 secondary_writer
= new IndexWriter (SecondaryStore
, IndexingAnalyzer
, false);
277 secondary_writer
.AddDocument (new_doc
);
279 // Add children property change indexables...
280 AddChildrenPropertyChange (
281 prop_change_children_docs
,
285 continue; // ...and proceed to the next Indexable
288 // If we reach this point we know we are dealing with an IndexableType.Add
290 if (indexable
.Type
!= IndexableType
.Add
)
291 throw new Exception ("When I said it was an IndexableType.Add, I meant it!");
293 Logger
.Log
.Debug ("+{0}", indexable
.DisplayUri
);
295 Filter filter
= null;
297 // If we have content, try to find a filter
298 // which we can use to process the indexable.
300 FilterFactory
.FilterIndexable (indexable
, text_cache
, out filter
);
301 } catch (Exception e
) {
302 Logger
.Log
.Error (e
, "Unable to filter {0} (mimetype={1})", indexable
.DisplayUri
, indexable
.MimeType
);
303 indexable
.NoContent
= true;
306 Document primary_doc
= null, secondary_doc
= null;
309 BuildDocuments (indexable
, out primary_doc
, out secondary_doc
);
310 primary_writer
.AddDocument (primary_doc
);
311 } catch (Exception ex
) {
313 // If an exception was thrown, something bad probably happened
314 // while we were filtering the content. Set NoContent to true
315 // and try again -- that way it will at least end up in the index,
316 // even if we don't manage to extract the fulltext.
318 Logger
.Log
.Debug (ex
, "First attempt to index {0} failed", indexable
.DisplayUri
);
320 indexable
.NoContent
= true;
323 BuildDocuments (indexable
, out primary_doc
, out secondary_doc
);
324 primary_writer
.AddDocument (primary_doc
);
325 } catch (Exception ex2
) {
326 Logger
.Log
.Debug (ex2
, "Second attempt to index {0} failed, giving up...", indexable
.DisplayUri
);
330 if (filter
!= null) {
332 // Force the clean-up of temporary files, just in case.
// Record which filter (and which version of it) processed the indexable.
335 r
.FilterName
= filter
.GetType ().ToString ();
336 r
.FilterVersion
= filter
.Version
;
338 // Create a receipt containing any child indexables.
339 if (filter
.ChildIndexables
.Count
> 0) {
340 Log
.Debug ("{0} (filtered with {1}) has generated {2} child indexable{3}", indexable
.DisplayUri
, r
.FilterName
, filter
.ChildIndexables
.Count
, filter
.ChildIndexables
.Count
> 1 ? "s" : "");
341 IndexerChildIndexablesReceipt cr
;
342 cr
= new IndexerChildIndexablesReceipt (indexable
, filter
.ChildIndexables
);
343 receipt_queue
.Add (cr
);
347 if (secondary_doc
!= null) {
348 if (secondary_writer
== null)
349 secondary_writer
= new IndexWriter (SecondaryStore
, IndexingAnalyzer
, false);
351 secondary_writer
.AddDocument (secondary_doc
);
356 // Clean up any temporary files associated with filtering this indexable.
357 indexable
.Cleanup ();
360 if (text_cache
!= null)
361 text_cache
.CommitTransaction ();
363 if (request
.OptimizeIndex
) {
364 Stopwatch watch
= new Stopwatch ();
365 Logger
.Log
.Debug ("Optimizing {0}", IndexName
);
367 primary_writer
.Optimize ();
// Optimizing always touches the secondary index, so the writer is
// created here if nothing was written to it during step #3.
368 if (secondary_writer
== null)
369 secondary_writer
= new IndexWriter (SecondaryStore
, IndexingAnalyzer
, false);
370 secondary_writer
.Optimize ();
372 Logger
.Log
.Debug ("{0} optimized in {1}", IndexName
, watch
);
375 // Step #4. Close our writers and return the events to
376 // indicate what has happened.
378 primary_writer
.Close ();
379 if (secondary_writer
!= null)
380 secondary_writer
.Close ();
// Copy the queued receipts into a typed array for the caller.
382 IndexerReceipt
[] receipt_array
;
383 receipt_array
= new IndexerReceipt
[receipt_queue
.Count
];
384 for (int i
= 0; i
< receipt_queue
.Count
; ++i
)
385 receipt_array
[i
] = (IndexerReceipt
) receipt_queue
[i
];
387 return receipt_array
;
390 // Since some parent properties may be stored in child properties
391 // as parent: property, any property change should be propagated
392 // to all its children as well.
//
// children_docs maps a parent Uri to an ArrayList of child Uris;
// receipt_queue receives one IndexerChildIndexablesReceipt holding a
// PropertyChange indexable for every child of the parent.
// NOTE(review): the declaration of the "parent" argument is not visible
// in this view; it is used below through parent.Uri and SetChildOf.
393 private void AddChildrenPropertyChange (
394 Hashtable children_docs
,
396 ArrayList receipt_queue
)
// Check whether any children were recorded for this parent.
// NOTE(review): presumably the method returns early when there are
// none -- the statement under this check is not visible here; confirm.
398 if (! children_docs
.Contains (parent
.Uri
))
401 ArrayList children_list
= (ArrayList
) children_docs
[parent
.Uri
];
402 IndexerChildIndexablesReceipt child_r
;
403 child_r
= new IndexerChildIndexablesReceipt ();
404 ArrayList child_indexable_list
= new ArrayList ();
// Build one PropertyChange indexable per child Uri and link it to the
// parent.
406 foreach (Uri uri
in children_list
) {
407 Indexable child_indexable
;
408 child_indexable
= new Indexable (IndexableType
.PropertyChange
, uri
);
// The format string names the child ({1}) before the parent ({0}).
409 Log
.Debug ("Creating property change child indexable for {1} (parent {0})", parent
.Uri
, uri
);
411 child_indexable
.SetChildOf (parent
);
412 child_indexable_list
.Add (child_indexable
);
415 child_r
.Children
= child_indexable_list
;
416 receipt_queue
.Add (child_r
);
419 ////////////////////////////////////////////////////////////////
// Opens an IndexWriter on PrimaryStore and, when SecondaryStore is not
// null, another one on SecondaryStore.  Both are opened with a null
// analyzer and "false" as the third constructor argument.
// NOTE(review): the calls made on each writer (presumably Optimize and
// Close) are not visible in this view; confirm each writer is closed
// before "writer" is reassigned.
421 public void OptimizeNow ()
425 writer
= new IndexWriter (PrimaryStore
, null, false);
429 if (SecondaryStore
!= null) {
430 writer
= new IndexWriter (SecondaryStore
, null, false);
// Merges another index into this one: the primary store of
// index_to_merge is added to PrimaryStore and its secondary store to
// SecondaryStore, each through its own IndexWriter.AddIndexes call.
// Each writer is closed right after its merge.
// NOTE(review): OptimizeNow checks SecondaryStore for null before
// opening a writer on it; no such check is visible here -- confirm that
// neither this index nor index_to_merge can lack a secondary store.
437 public void Merge (LuceneCommon index_to_merge
)
439 // FIXME: Error recovery
441 // Merge the primary index
442 IndexWriter primary_writer
;
// AddIndexes takes an array of directories, hence the one-element array.
443 Lucene
.Net
.Store
.Directory
[] primary_store
= {index_to_merge.PrimaryStore}
;
444 primary_writer
= new IndexWriter (PrimaryStore
, null, false);
446 primary_writer
.AddIndexes (primary_store
);
447 primary_writer
.Close ();
449 // Merge the secondary index
450 IndexWriter secondary_writer
;
451 Lucene
.Net
.Store
.Directory
[] secondary_store
= {index_to_merge.SecondaryStore}
;
452 secondary_writer
= new IndexWriter (SecondaryStore
, null, false);
454 secondary_writer
.AddIndexes (secondary_store
);
455 secondary_writer
.Close ();