[beagle.git] / beagled / Lucene.Net / Index / IndexWriter.cs
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Document = Lucene.Net.Documents.Document;
using Similarity = Lucene.Net.Search.Similarity;
using Directory = Lucene.Net.Store.Directory;
using FSDirectory = Lucene.Net.Store.FSDirectory;
using IndexInput = Lucene.Net.Store.IndexInput;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using Lock = Lucene.Net.Store.Lock;
using RAMDirectory = Lucene.Net.Store.RAMDirectory;
namespace Lucene.Net.Index
{
    
    /// <summary>An IndexWriter creates and maintains an index.
    /// The third argument to the
    /// <a href="#IndexWriter(Lucene.Net.store.Directory, Lucene.Net.analysis.Analyzer, boolean)"><b>constructor</b></a>
    /// determines whether a new index is created, or whether an existing index is
    /// opened for the addition of new documents.
    /// In either case, documents are added with the <a
    /// href="#addDocument(Lucene.Net.document.Document)"><b>addDocument</b></a> method.
    /// When finished adding documents, <a href="#close()"><b>close</b></a> should be called.
    /// <p>If an index will not have more documents added for a while and optimal search
    /// performance is desired, then the <a href="#optimize()"><b>optimize</b></a>
    /// method should be called before the index is closed.</p>
    /// <p>Opening an IndexWriter creates a lock file for the directory in use. Trying to open
    /// another IndexWriter on the same directory will lead to an IOException. An IOException
    /// is also thrown if an IndexReader on the same directory is used to delete documents
    /// from the index.</p>
    /// </summary>
    /// <seealso cref="IndexModifier">IndexModifier supports the important methods of IndexWriter plus deletion.</seealso>
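    /// <example>A minimal usage sketch. The index path "/tmp/index" and the
    /// StandardAnalyzer are placeholders, and the Field construction assumes
    /// the Lucene 1.9-era Field API; adjust to the surrounding application:
    /// <code>
    /// IndexWriter writer = new IndexWriter("/tmp/index", new StandardAnalyzer(), true);
    /// Document doc = new Document();
    /// doc.Add(new Field("contents", "an example document", Field.Store.YES, Field.Index.TOKENIZED));
    /// writer.AddDocument(doc);
    /// writer.Optimize(); // optional: merge to one segment for faster searches
    /// writer.Close();
    /// </code>
    /// </example>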
    public class IndexWriter
    {
        // Lock.With helper that, while the commit lock is held, writes the
        // segments file (on create) or reads it (on open of an existing index).
        private class AnonymousClassWith : Lock.With
        {
            private void InitBlock(bool create, IndexWriter enclosingInstance)
            {
                this.create = create;
                this.enclosingInstance = enclosingInstance;
            }
            private bool create;
            private IndexWriter enclosingInstance;
            public IndexWriter Enclosing_Instance
            {
                get
                {
                    return enclosingInstance;
                }
            }
            internal AnonymousClassWith(bool create, IndexWriter enclosingInstance, Lucene.Net.Store.Lock Param1, long Param2) : base(Param1, Param2)
            {
                InitBlock(create, enclosingInstance);
            }
            public override System.Object DoBody()
            {
                if (create)
                    Enclosing_Instance.segmentInfos.Write(Enclosing_Instance.directory);
                else
                    Enclosing_Instance.segmentInfos.Read(Enclosing_Instance.directory);
                return null;
            }
        }
        // Lock.With helper that commits the new segments file and then deletes
        // the now-unused segments, all while the commit lock is held.
        private class AnonymousClassWith1 : Lock.With
        {
            private void InitBlock(System.Collections.ArrayList segmentsToDelete, IndexWriter enclosingInstance)
            {
                this.segmentsToDelete = segmentsToDelete;
                this.enclosingInstance = enclosingInstance;
            }
            private System.Collections.ArrayList segmentsToDelete;
            private IndexWriter enclosingInstance;
            public IndexWriter Enclosing_Instance
            {
                get
                {
                    return enclosingInstance;
                }
            }
            internal AnonymousClassWith1(System.Collections.ArrayList segmentsToDelete, IndexWriter enclosingInstance, Lucene.Net.Store.Lock Param1, long Param2) : base(Param1, Param2)
            {
                InitBlock(segmentsToDelete, enclosingInstance);
            }
            public override System.Object DoBody()
            {
                Enclosing_Instance.segmentInfos.Write(Enclosing_Instance.directory); // commit changes
                Enclosing_Instance.DeleteSegments(segmentsToDelete); // delete now-unused segments
                return null;
            }
        }
        // Lock.With helper that renames a finished compound file from ".tmp"
        // to ".cfs" and deletes the segment's now-unused loose files.
        private class AnonymousClassWith2 : Lock.With
        {
            private void InitBlock(System.String mergedName, System.Collections.ArrayList filesToDelete, IndexWriter enclosingInstance)
            {
                this.mergedName = mergedName;
                this.filesToDelete = filesToDelete;
                this.enclosingInstance = enclosingInstance;
            }
            private System.String mergedName;
            private System.Collections.ArrayList filesToDelete;
            private IndexWriter enclosingInstance;
            public IndexWriter Enclosing_Instance
            {
                get
                {
                    return enclosingInstance;
                }
            }
            internal AnonymousClassWith2(System.String mergedName, System.Collections.ArrayList filesToDelete, IndexWriter enclosingInstance, Lucene.Net.Store.Lock Param1, long Param2) : base(Param1, Param2)
            {
                InitBlock(mergedName, filesToDelete, enclosingInstance);
            }
            public override System.Object DoBody()
            {
                // make compound file visible for SegmentReaders
                Enclosing_Instance.directory.RenameFile(mergedName + ".tmp", mergedName + ".cfs");
                // delete now-unused files of segment
                Enclosing_Instance.DeleteFiles(filesToDelete);
                return null;
            }
        }
        // Same pattern as AnonymousClassWith1, used from MergeSegments:
        // commit the segments file, then delete the segments merged away.
        private class AnonymousClassWith3 : Lock.With
        {
            private void InitBlock(System.Collections.ArrayList segmentsToDelete, IndexWriter enclosingInstance)
            {
                this.segmentsToDelete = segmentsToDelete;
                this.enclosingInstance = enclosingInstance;
            }
            private System.Collections.ArrayList segmentsToDelete;
            private IndexWriter enclosingInstance;
            public IndexWriter Enclosing_Instance
            {
                get
                {
                    return enclosingInstance;
                }
            }
            internal AnonymousClassWith3(System.Collections.ArrayList segmentsToDelete, IndexWriter enclosingInstance, Lucene.Net.Store.Lock Param1, long Param2) : base(Param1, Param2)
            {
                InitBlock(segmentsToDelete, enclosingInstance);
            }
            public override System.Object DoBody()
            {
                Enclosing_Instance.segmentInfos.Write(Enclosing_Instance.directory); // commit before deleting
                Enclosing_Instance.DeleteSegments(segmentsToDelete); // delete now-unused segments
                return null;
            }
        }
        // Same pattern as AnonymousClassWith2, used from MergeSegments.
        private class AnonymousClassWith4 : Lock.With
        {
            private void InitBlock(System.String mergedName, System.Collections.ArrayList filesToDelete, IndexWriter enclosingInstance)
            {
                this.mergedName = mergedName;
                this.filesToDelete = filesToDelete;
                this.enclosingInstance = enclosingInstance;
            }
            private System.String mergedName;
            private System.Collections.ArrayList filesToDelete;
            private IndexWriter enclosingInstance;
            public IndexWriter Enclosing_Instance
            {
                get
                {
                    return enclosingInstance;
                }
            }
            internal AnonymousClassWith4(System.String mergedName, System.Collections.ArrayList filesToDelete, IndexWriter enclosingInstance, Lucene.Net.Store.Lock Param1, long Param2) : base(Param1, Param2)
            {
                InitBlock(mergedName, filesToDelete, enclosingInstance);
            }
            public override System.Object DoBody()
            {
                // make compound file visible for SegmentReaders
                Enclosing_Instance.directory.RenameFile(mergedName + ".tmp", mergedName + ".cfs");
                // delete now-unused files of segment
                Enclosing_Instance.DeleteFiles(filesToDelete);
                return null;
            }
        }
        private void InitBlock()
        {
            similarity = Similarity.GetDefault();
        }
        
        /// <summary> Default value is 1,000 milliseconds.</summary>
        public const long WRITE_LOCK_TIMEOUT = 1000;
        
        /// <summary> Default value is 10,000 milliseconds.</summary>
        public const long COMMIT_LOCK_TIMEOUT = 10000;
        
        public const System.String WRITE_LOCK_NAME = "write.lock";
        public const System.String COMMIT_LOCK_NAME = "commit.lock";
        
        /// <summary> Default value is 10. Change using {@link #SetMergeFactor(int)}.</summary>
        public const int DEFAULT_MERGE_FACTOR = 10;
        
        /// <summary> Default value is 10. Change using {@link #SetMaxBufferedDocs(int)}.</summary>
        public const int DEFAULT_MAX_BUFFERED_DOCS = 10;
        
        /// <deprecated> use {@link #DEFAULT_MAX_BUFFERED_DOCS} instead
        /// </deprecated>
        public static readonly int DEFAULT_MIN_MERGE_DOCS = DEFAULT_MAX_BUFFERED_DOCS;
        
        /// <summary> Default value is System.Int32.MaxValue. Change using {@link #SetMaxMergeDocs(int)}.</summary>
        public static readonly int DEFAULT_MAX_MERGE_DOCS = System.Int32.MaxValue;
        
        /// <summary> Default value is 10,000. Change using {@link #SetMaxFieldLength(int)}.</summary>
        public const int DEFAULT_MAX_FIELD_LENGTH = 10000;
        
        /// <summary> Default value is 128. Change using {@link #SetTermIndexInterval(int)}.</summary>
        public const int DEFAULT_TERM_INDEX_INTERVAL = 128;
        private Directory directory; // where this index resides
        private Analyzer analyzer; // how to analyze text
        
        private Similarity similarity; // how to normalize
        
        private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
        private Directory ramDirectory = new RAMDirectory(); // for temp segs
        
        private int singleDocSegmentsCount = 0; // for speeding decision on merge candidates
        
        private Lock writeLock;
        
        private int termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL;
        
        /// <summary>Use compound file setting. Defaults to true, minimizing the number of
        /// files used. Setting this to false may improve indexing performance, but
        /// may also cause file handle problems.
        /// </summary>
        private bool useCompoundFile = true;
        
        private bool closeDir;
        /// <summary>Get the current setting of whether to use the compound file format.
        /// Note that this just returns the value you set with SetUseCompoundFile(bool),
        /// or the default; it cannot be used to query the status of an existing index.
        /// </summary>
        /// <seealso cref="SetUseCompoundFile(bool)">
        /// </seealso>
        public virtual bool GetUseCompoundFile()
        {
            return useCompoundFile;
        }
        
        /// <summary>Setting to turn on usage of a compound file. When on, the multiple files
        /// for each segment are merged into a single file once segment creation
        /// is finished. This is done regardless of what directory is in use.
        /// </summary>
        public virtual void SetUseCompoundFile(bool value_Renamed)
        {
            useCompoundFile = value_Renamed;
        }
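        
        // Design note: with the compound format each freshly written segment is
        // packed into a single ".cfs" file (see the ".tmp" -> ".cfs" rename in
        // the helpers above), trading a little indexing time for far fewer open
        // file handles at search time.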
        /// <summary>Expert: Set the Similarity implementation used by this IndexWriter.
        /// </summary>
        /// <seealso cref="Similarity.SetDefault(Similarity)">
        /// </seealso>
        public virtual void SetSimilarity(Similarity similarity)
        {
            this.similarity = similarity;
        }
        
        /// <summary>Expert: Return the Similarity implementation used by this IndexWriter.
        ///
        /// <p>This defaults to the current value of {@link Similarity#GetDefault()}.</p>
        /// </summary>
        public virtual Similarity GetSimilarity()
        {
            return this.similarity;
        }
        /// <summary>Expert: Set the interval between indexed terms. Large values cause less
        /// memory to be used by an IndexReader, but slow random access to terms. Small
        /// values cause more memory to be used by an IndexReader, and speed up
        /// random access to terms.
        ///
        /// This parameter determines the amount of computation required per query
        /// term, regardless of the number of documents that contain that term. In
        /// particular, it is the maximum number of other terms that must be
        /// scanned before a term is located and its frequency and position information
        /// may be processed. In a large index with user-entered query terms, query
        /// processing time is likely to be dominated not by term lookup but rather
        /// by the processing of frequency and positional data. In a small index,
        /// or when many uncommon query terms are generated (e.g., by wildcard
        /// queries), term lookup may become a dominant cost.
        ///
        /// In particular, <code>numUniqueTerms/interval</code> terms are read into
        /// memory by an IndexReader, and, on average, <code>interval/2</code> terms
        /// must be scanned for each random term access.
        ///
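        /// For example (illustrative numbers): with 1,000,000 unique terms and the
        /// default interval of 128, an IndexReader holds roughly 7,800 terms in
        /// memory, and a random lookup scans about 64 terms on average.
        ///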
        /// </summary>
        /// <seealso cref="DEFAULT_TERM_INDEX_INTERVAL">
        /// </seealso>
        public virtual void SetTermIndexInterval(int interval)
        {
            this.termIndexInterval = interval;
        }
        
        /// <summary>Expert: Return the interval between indexed terms.
        /// </summary>
        /// <seealso cref="SetTermIndexInterval(int)">
        /// </seealso>
        public virtual int GetTermIndexInterval()
        {
            return termIndexInterval;
        }
        /// <summary> Constructs an IndexWriter for the index in <code>path</code>.
        /// Text will be analyzed with <code>a</code>. If <code>create</code>
        /// is true, then a new, empty index will be created in
        /// <code>path</code>, replacing any index already there.
        /// </summary>
        /// <param name="path">the path to the index directory
        /// </param>
        /// <param name="a">the analyzer to use
        /// </param>
        /// <param name="create"><code>true</code> to create the index or overwrite
        /// the existing one; <code>false</code> to append to the existing
        /// index
        /// </param>
        /// <throws> IOException if the directory cannot be read from or written to,
        /// or if it does not exist and <code>create</code> is <code>false</code>
        /// </throws>
        public IndexWriter(System.String path, Analyzer a, bool create) : this(FSDirectory.GetDirectory(path, create), a, create, true)
        {
        }
        /// <summary> Constructs an IndexWriter for the index in <code>path</code>.
        /// Text will be analyzed with <code>a</code>. If <code>create</code>
        /// is true, then a new, empty index will be created in
        /// <code>path</code>, replacing any index already there.
        /// </summary>
        /// <param name="path">the path to the index directory
        /// </param>
        /// <param name="a">the analyzer to use
        /// </param>
        /// <param name="create"><code>true</code> to create the index or overwrite
        /// the existing one; <code>false</code> to append to the existing
        /// index
        /// </param>
        /// <throws> IOException if the directory cannot be read from or written to,
        /// or if it does not exist and <code>create</code> is <code>false</code>
        /// </throws>
        public IndexWriter(System.IO.FileInfo path, Analyzer a, bool create) : this(FSDirectory.GetDirectory(path, create), a, create, true)
        {
        }
        /// <summary> Constructs an IndexWriter for the index in <code>d</code>.
        /// Text will be analyzed with <code>a</code>. If <code>create</code>
        /// is true, then a new, empty index will be created in
        /// <code>d</code>, replacing any index already there.
        /// </summary>
        /// <param name="d">the index directory
        /// </param>
        /// <param name="a">the analyzer to use
        /// </param>
        /// <param name="create"><code>true</code> to create the index or overwrite
        /// the existing one; <code>false</code> to append to the existing
        /// index
        /// </param>
        /// <throws> IOException if the directory cannot be read from or written to,
        /// or if it does not exist and <code>create</code> is <code>false</code>
        /// </throws>
        public IndexWriter(Directory d, Analyzer a, bool create) : this(d, a, create, false)
        {
        }
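        
        // Example (sketch): append to an existing on-disk index, creating it
        // first if it is not there yet. "indexPath" and "analyzer" are
        // placeholders; IndexReader.IndexExists is part of this library.
        //
        //   bool exists = IndexReader.IndexExists(indexPath);
        //   IndexWriter writer = new IndexWriter(indexPath, analyzer, !exists);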
        private IndexWriter(Directory d, Analyzer a, bool create, bool closeDir)
        {
            InitBlock();
            this.closeDir = closeDir;
            directory = d;
            analyzer = a;
            
            Lock writeLock = directory.MakeLock(IndexWriter.WRITE_LOCK_NAME);
            if (!writeLock.Obtain(WRITE_LOCK_TIMEOUT))
            {
                // obtain write lock
                throw new System.IO.IOException("Index locked for write: " + writeLock);
            }
            this.writeLock = writeLock; // save it
            
            lock (directory)
            {
                // in- & inter-process sync
                new AnonymousClassWith(create, this, directory.MakeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT).Run();
            }
        }
        /// <summary>Determines the largest number of documents ever merged by addDocument().
        /// Small values (e.g., less than 10,000) are best for interactive indexing,
        /// as this limits the length of pauses while indexing to a few seconds.
        /// Larger values are best for batched indexing and speedier searches.
        ///
        /// <p>The default value is System.Int32.MaxValue.</p>
        /// </summary>
        public virtual void SetMaxMergeDocs(int maxMergeDocs)
        {
            this.maxMergeDocs = maxMergeDocs;
        }
        
        /// <seealso cref="SetMaxMergeDocs">
        /// </seealso>
        public virtual int GetMaxMergeDocs()
        {
            return maxMergeDocs;
        }
        /// <summary> The maximum number of terms that will be indexed for a single field in a
        /// document. This limits the amount of memory required for indexing, so that
        /// collections with very large files will not crash the indexing process by
        /// running out of memory.<p/>
        /// Note that this effectively truncates large documents, excluding from the
        /// index terms that occur further in the document. If you know your source
        /// documents are large, be sure to set this value high enough to accommodate
        /// the expected size. If you set it to System.Int32.MaxValue, then the only limit
        /// is your memory, but you should anticipate a System.OutOfMemoryException.<p/>
        /// By default, no more than 10,000 terms will be indexed for a field.
        /// </summary>
        public virtual void SetMaxFieldLength(int maxFieldLength)
        {
            this.maxFieldLength = maxFieldLength;
        }
        
        /// <seealso cref="SetMaxFieldLength">
        /// </seealso>
        public virtual int GetMaxFieldLength()
        {
            return maxFieldLength;
        }
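        
        // Example (sketch): lift the truncation limit entirely for very large
        // documents, at the cost of memory during indexing.
        //
        //   writer.SetMaxFieldLength(System.Int32.MaxValue);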
        /// <summary>Determines the minimal number of documents required before the buffered
        /// in-memory documents are merged and a new segment is created.
        /// Since documents are buffered in a {@link Lucene.Net.store.RAMDirectory},
        /// a large value gives faster indexing. At the same time, mergeFactor limits
        /// the number of files open in an FSDirectory.
        ///
        /// <p>The default value is 10.</p>
        /// </summary>
        /// <throws> ArgumentException if maxBufferedDocs is smaller than 2 </throws>
        public virtual void SetMaxBufferedDocs(int maxBufferedDocs)
        {
            if (maxBufferedDocs < 2)
                throw new System.ArgumentException("maxBufferedDocs must be at least 2");
            this.minMergeDocs = maxBufferedDocs;
        }
        
        /// <seealso cref="SetMaxBufferedDocs">
        /// </seealso>
        public virtual int GetMaxBufferedDocs()
        {
            return minMergeDocs;
        }
        /// <summary>Determines how often segment indices are merged by addDocument(). With
        /// smaller values, less RAM is used while indexing, and searches on
        /// unoptimized indices are faster, but indexing speed is slower. With larger
        /// values, more RAM is used during indexing, and while searches on unoptimized
        /// indices are slower, indexing is faster. Thus larger values (&gt; 10) are best
        /// for batch index creation, and smaller values (&lt; 10) for indices that are
        /// interactively maintained.
        ///
        /// <p>This must never be less than 2. The default value is 10.</p>
        /// </summary>
        public virtual void SetMergeFactor(int mergeFactor)
        {
            if (mergeFactor < 2)
                throw new System.ArgumentException("mergeFactor cannot be less than 2");
            this.mergeFactor = mergeFactor;
        }
        
        /// <seealso cref="SetMergeFactor">
        /// </seealso>
        public virtual int GetMergeFactor()
        {
            return mergeFactor;
        }
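        
        // Tuning sketch (illustrative values): favor batch-indexing throughput
        // at the cost of RAM and open file handles.
        //
        //   writer.SetMergeFactor(30);       // merge less often
        //   writer.SetMaxBufferedDocs(1000); // buffer more docs in RAM first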
        /// <summary>If non-null, information about merges and a message when
        /// maxFieldLength is reached will be printed to this.
        /// </summary>
        public virtual void SetInfoStream(System.IO.TextWriter infoStream)
        {
            this.infoStream = infoStream;
        }
        
        /// <seealso cref="SetInfoStream">
        /// </seealso>
        public virtual System.IO.TextWriter GetInfoStream()
        {
            return infoStream;
        }
        /// <summary>Flushes all changes to an index and closes all associated files. </summary>
        public virtual void Close()
        {
            lock (this)
            {
                FlushRamSegments();
                ramDirectory.Close();
                if (writeLock != null)
                {
                    writeLock.Release(); // release write lock
                    writeLock = null;
                }
                if (closeDir)
                    directory.Close();
                System.GC.SuppressFinalize(this);
            }
        }
        
        /// <summary>Release the write lock, if needed. </summary>
        ~IndexWriter()
        {
            if (writeLock != null)
            {
                writeLock.Release(); // release write lock
                writeLock = null;
            }
        }
        /// <summary>Returns the Directory used by this index. </summary>
        public virtual Directory GetDirectory()
        {
            return directory;
        }
        
        /// <summary>Returns the analyzer used by this index. </summary>
        public virtual Analyzer GetAnalyzer()
        {
            return analyzer;
        }
        
        /// <summary>Returns the number of documents currently in this index. </summary>
        public virtual int DocCount()
        {
            lock (this)
            {
                int count = 0;
                for (int i = 0; i < segmentInfos.Count; i++)
                {
                    SegmentInfo si = segmentInfos.Info(i);
                    count += si.docCount;
                }
                return count;
            }
        }
        /// <summary> The maximum number of terms that will be indexed for a single field in a
        /// document. This limits the amount of memory required for indexing, so that
        /// collections with very large files will not crash the indexing process by
        /// running out of memory.<p/>
        /// Note that this effectively truncates large documents, excluding from the
        /// index terms that occur further in the document. If you know your source
        /// documents are large, be sure to set this value high enough to accommodate
        /// the expected size. If you set it to System.Int32.MaxValue, then the only limit
        /// is your memory, but you should anticipate a System.OutOfMemoryException.<p/>
        /// By default, no more than 10,000 terms will be indexed for a field.
        /// </summary>
        /// <deprecated> use {@link #SetMaxFieldLength} instead
        /// </deprecated>
        public int maxFieldLength = DEFAULT_MAX_FIELD_LENGTH;
        /// <summary> Adds a document to this index. If the document contains more than
        /// {@link #SetMaxFieldLength(int)} terms for a given field, the remainder are
        /// discarded.
        /// </summary>
        public virtual void AddDocument(Document doc)
        {
            AddDocument(doc, analyzer);
        }
        
        /// <summary> Adds a document to this index, using the provided analyzer instead of the
        /// value of {@link #GetAnalyzer()}. If the document contains more than
        /// {@link #SetMaxFieldLength(int)} terms for a given field, the remainder are
        /// discarded.
        /// </summary>
        public virtual void AddDocument(Document doc, Analyzer analyzer)
        {
            DocumentWriter dw = new DocumentWriter(ramDirectory, analyzer, this);
            dw.SetInfoStream(infoStream);
            System.String segmentName = NewSegmentName();
            dw.AddDocument(segmentName, doc);
            lock (this)
            {
                segmentInfos.Add(new SegmentInfo(segmentName, 1, ramDirectory));
                singleDocSegmentsCount++;
                MaybeMergeSegments();
            }
        }
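        
        // Example (sketch): index one document with a different analyzer than
        // the writer's default, e.g. for per-language analysis ("deAnalyzer"
        // is a placeholder):
        //
        //   writer.AddDocument(doc, deAnalyzer);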
        internal int GetSegmentsCounter()
        {
            return segmentInfos.counter;
        }
        
        private System.String NewSegmentName()
        {
            lock (this)
            {
                // segment names are the running counter in base 36: "_0", "_1", ... "_a", ...
                return "_" + SupportClass.Number.ToString(segmentInfos.counter++, SupportClass.Number.MAX_RADIX);
            }
        }
        /// <summary>Determines how often segment indices are merged by addDocument(). With
        /// smaller values, less RAM is used while indexing, and searches on
        /// unoptimized indices are faster, but indexing speed is slower. With larger
        /// values, more RAM is used during indexing, and while searches on unoptimized
        /// indices are slower, indexing is faster. Thus larger values (&gt; 10) are best
        /// for batch index creation, and smaller values (&lt; 10) for indices that are
        /// interactively maintained.
        ///
        /// <p>This must never be less than 2. The default value is 10.</p>
        /// </summary>
        /// <deprecated> use {@link #SetMergeFactor} instead
        /// </deprecated>
        public int mergeFactor = DEFAULT_MERGE_FACTOR;
        /// <summary>Determines the minimal number of documents required before the buffered
        /// in-memory documents are merged and a new segment is created.
        /// Since documents are buffered in a {@link Lucene.Net.store.RAMDirectory},
        /// a large value gives faster indexing. At the same time, mergeFactor limits
        /// the number of files open in an FSDirectory.
        ///
        /// <p>The default value is 10.</p>
        /// </summary>
        /// <deprecated> use {@link #SetMaxBufferedDocs} instead
        /// </deprecated>
        public int minMergeDocs = DEFAULT_MIN_MERGE_DOCS;
        /// <summary>Determines the largest number of documents ever merged by addDocument().
        /// Small values (e.g., less than 10,000) are best for interactive indexing,
        /// as this limits the length of pauses while indexing to a few seconds.
        /// Larger values are best for batched indexing and speedier searches.
        ///
        /// <p>The default value is System.Int32.MaxValue.</p>
        /// </summary>
        /// <deprecated> use {@link #SetMaxMergeDocs} instead
        /// </deprecated>
        public int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
        
        /// <summary>If non-null, information about merges will be printed to this.</summary>
        /// <deprecated> use {@link #SetInfoStream} instead
        /// </deprecated>
        public System.IO.TextWriter infoStream = null;
        /// <summary>Merges all segments together into a single segment, optimizing an index
        /// for search.
        /// </summary>
        public virtual void Optimize()
        {
            lock (this)
            {
                FlushRamSegments();
                // keep merging until a single segment remains that lives in this
                // directory, has no deletions, and (if requested) uses the
                // compound format with no separate norms
                while (segmentInfos.Count > 1 ||
                       (segmentInfos.Count == 1 &&
                        (SegmentReader.HasDeletions(segmentInfos.Info(0)) ||
                         segmentInfos.Info(0).dir != directory ||
                         (useCompoundFile &&
                          (!SegmentReader.UsesCompoundFile(segmentInfos.Info(0)) ||
                           SegmentReader.HasSeparateNorms(segmentInfos.Info(0)))))))
                {
                    int minSegment = segmentInfos.Count - mergeFactor;
                    MergeSegments(minSegment < 0 ? 0 : minSegment);
                }
            }
        }
        /// <summary>Merges all segments from an array of indexes into this index.
        ///
        /// <p>This may be used to parallelize batch indexing. A large document
        /// collection can be broken into sub-collections. Each sub-collection can be
        /// indexed in parallel, on a different thread, process or machine. The
        /// complete index can then be created by merging sub-collection indexes
        /// with this method (see the sketch below).</p>
        ///
        /// <p>After this completes, the index is optimized.</p>
        /// </summary>
        public virtual void AddIndexes(Directory[] dirs)
        {
            lock (this)
            {
                Optimize(); // start with zero or 1 seg
                
                int start = segmentInfos.Count;
                
                for (int i = 0; i < dirs.Length; i++)
                {
                    SegmentInfos sis = new SegmentInfos(); // read infos from dir
                    sis.Read(dirs[i]);
                    for (int j = 0; j < sis.Count; j++)
                    {
                        segmentInfos.Add(sis.Info(j)); // add each info
                    }
                }
                
                // merge newly added segments in log(n) passes
                while (segmentInfos.Count > start + mergeFactor)
                {
                    for (int base_Renamed = start + 1; base_Renamed < segmentInfos.Count; base_Renamed++)
                    {
                        int end = System.Math.Min(segmentInfos.Count, base_Renamed + mergeFactor);
                        if (end - base_Renamed > 1)
                            MergeSegments(base_Renamed, end);
                    }
                }
                
                Optimize(); // final cleanup
            }
        }
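        
        // Example (sketch): fold two sub-indexes, built in parallel, into this
        // writer's index. The paths are placeholders.
        //
        //   Directory[] parts = new Directory[] {
        //       FSDirectory.GetDirectory("/tmp/part-a", false),
        //       FSDirectory.GetDirectory("/tmp/part-b", false)
        //   };
        //   writer.AddIndexes(parts);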
        /// <summary>Merges the provided indexes into this index.
        /// <p>After this completes, the index is optimized.</p>
        /// <p>The provided IndexReaders are not closed.</p>
        /// </summary>
        public virtual void AddIndexes(IndexReader[] readers)
        {
            lock (this)
            {
                Optimize(); // start with zero or 1 seg
                
                System.String mergedName = NewSegmentName();
                SegmentMerger merger = new SegmentMerger(this, mergedName);
                
                System.Collections.ArrayList segmentsToDelete = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
                IndexReader sReader = null;
                if (segmentInfos.Count == 1)
                {
                    // add existing index, if any
                    sReader = SegmentReader.Get(segmentInfos.Info(0));
                    merger.Add(sReader);
                    segmentsToDelete.Add(sReader); // queue segment for deletion
                }
                
                for (int i = 0; i < readers.Length; i++)
                {
                    // add new indexes
                    merger.Add(readers[i]);
                }
                
                int docCount = merger.Merge(); // merge 'em
                
                segmentInfos.RemoveRange(0, segmentInfos.Count); // pop old infos & add new
                segmentInfos.Add(new SegmentInfo(mergedName, docCount, directory));
                
                if (sReader != null)
                    sReader.Close();
                
                lock (directory)
                {
                    // in- & inter-process sync
                    new AnonymousClassWith1(segmentsToDelete, this, directory.MakeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT).Run();
                }
                
                if (useCompoundFile)
                {
                    System.Collections.ArrayList filesToDelete = merger.CreateCompoundFile(mergedName + ".tmp");
                    lock (directory)
                    {
                        // in- & inter-process sync
                        new AnonymousClassWith2(mergedName, filesToDelete, this, directory.MakeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT).Run();
                    }
                }
            }
        }
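        
        // Example (sketch): fold an already-open index into this one; the
        // reader stays open and must be closed by the caller. The path is a
        // placeholder.
        //
        //   IndexReader reader = IndexReader.Open("/tmp/other-index");
        //   writer.AddIndexes(new IndexReader[] { reader });
        //   reader.Close();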
        /// <summary>Merges all RAM-resident segments. </summary>
        private void FlushRamSegments()
        {
            int minSegment = segmentInfos.Count - 1;
            int docCount = 0;
            while (minSegment >= 0 && (segmentInfos.Info(minSegment)).dir == ramDirectory)
            {
                docCount += segmentInfos.Info(minSegment).docCount;
                minSegment--;
            }
            if (minSegment < 0 || (docCount + segmentInfos.Info(minSegment).docCount) > mergeFactor || !(segmentInfos.Info(segmentInfos.Count - 1).dir == ramDirectory))
                minSegment++;
            if (minSegment >= segmentInfos.Count)
                return; // none to merge
            MergeSegments(minSegment);
        }
        /// <summary>Incremental segment merger. </summary>
        private void MaybeMergeSegments()
        {
            long targetMergeDocs = minMergeDocs;
            while (targetMergeDocs <= maxMergeDocs)
            {
                // find segments smaller than current target size
                int minSegment = segmentInfos.Count - singleDocSegmentsCount; // top 1-doc segments are taken for sure
                int mergeDocs = singleDocSegmentsCount;
                while (--minSegment >= 0)
                {
                    SegmentInfo si = segmentInfos.Info(minSegment);
                    if (si.docCount >= targetMergeDocs)
                        break;
                    mergeDocs += si.docCount;
                }
                
                if (mergeDocs >= targetMergeDocs)
                {
                    // found a merge to do
                    MergeSegments(minSegment + 1);
                    singleDocSegmentsCount = 0;
                }
                else
                    break;
                
                targetMergeDocs *= mergeFactor; // increase target size
            }
        }
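        
        // Worked trace (defaults: mergeFactor = 10, minMergeDocs = 10): every
        // ten buffered single-document segments merge into one 10-doc segment;
        // once ten 10-doc segments accumulate, the target grows to 100 and they
        // merge into a 100-doc segment, and so on, giving log(N) cascading merges.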
        /// <summary>Pops segments off of the segmentInfos stack down to minSegment, merges them,
        /// and pushes the merged index onto the top of the segmentInfos stack.
        /// </summary>
        private void MergeSegments(int minSegment)
        {
            MergeSegments(minSegment, segmentInfos.Count);
        }
        
        /// <summary>Merges the named range of segments, replacing them in the stack with a
        /// single segment.
        /// </summary>
        private void MergeSegments(int minSegment, int end)
        {
            System.String mergedName = NewSegmentName();
            if (infoStream != null)
                infoStream.Write("merging segments");
            SegmentMerger merger = new SegmentMerger(this, mergedName);
            
            System.Collections.ArrayList segmentsToDelete = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
            for (int i = minSegment; i < end; i++)
            {
                SegmentInfo si = segmentInfos.Info(i);
                if (infoStream != null)
                    infoStream.Write(" " + si.name + " (" + si.docCount + " docs)");
                IndexReader reader = SegmentReader.Get(si);
                merger.Add(reader);
                if ((reader.Directory() == this.directory) || (reader.Directory() == this.ramDirectory))
                    segmentsToDelete.Add(reader); // queue segment for deletion
            }
            
            int mergedDocCount = merger.Merge();
            
            if (infoStream != null)
            {
                infoStream.WriteLine(" into " + mergedName + " (" + mergedDocCount + " docs)");
            }
            
            for (int i = end - 1; i >= minSegment; i--)
            {
                // remove old infos & add new
                segmentInfos.RemoveAt(i);
            }
            segmentInfos.Add(new SegmentInfo(mergedName, mergedDocCount, directory));
            
            // close readers before we attempt to delete now-obsolete segments
            merger.CloseReaders();
            
            lock (directory)
            {
                // in- & inter-process sync
                new AnonymousClassWith3(segmentsToDelete, this, directory.MakeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT).Run();
            }
            
            if (useCompoundFile)
            {
                System.Collections.ArrayList filesToDelete = merger.CreateCompoundFile(mergedName + ".tmp");
                lock (directory)
                {
                    // in- & inter-process sync
                    new AnonymousClassWith4(mergedName, filesToDelete, this, directory.MakeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT).Run();
                }
            }
        }
        /*
         * Some operating systems (e.g. Windows) don't permit a file to be deleted
         * while it is opened for read (e.g. by another process or thread). So we
         * assume that when a delete fails it is because the file is open in another
         * process, and queue the file for subsequent deletion.
         */
        
        private void DeleteSegments(System.Collections.ArrayList segments)
        {
            System.Collections.ArrayList deletable = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
            
            DeleteFiles(ReadDeleteableFiles(), deletable); // try to delete deleteable
            
            for (int i = 0; i < segments.Count; i++)
            {
                SegmentReader reader = (SegmentReader) segments[i];
                if (reader.Directory() == this.directory)
                    DeleteFiles(reader.Files(), deletable); // try to delete our files
                else
                    DeleteFiles(reader.Files(), reader.Directory()); // delete other files
            }
            
            WriteDeleteableFiles(deletable); // note files we can't delete
        }
        private void DeleteFiles(System.Collections.ArrayList files)
        {
            System.Collections.ArrayList deletable = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
            DeleteFiles(ReadDeleteableFiles(), deletable); // try to delete deleteable
            DeleteFiles(files, deletable); // try to delete our files
            WriteDeleteableFiles(deletable); // note files we can't delete
        }
        
        private void DeleteFiles(System.Collections.ArrayList files, Directory directory)
        {
            for (int i = 0; i < files.Count; i++)
                directory.DeleteFile((System.String) files[i]);
        }
        
        private void DeleteFiles(System.Collections.ArrayList files, System.Collections.ArrayList deletable)
        {
            for (int i = 0; i < files.Count; i++)
            {
                System.String file = (System.String) files[i];
                try
                {
                    directory.DeleteFile(file); // try to delete each file
                }
                catch (System.IO.IOException e)
                {
                    // if delete fails
                    if (directory.FileExists(file))
                    {
                        if (infoStream != null)
                        {
                            infoStream.WriteLine(e.ToString() + "; Will re-try later.");
                        }
                        deletable.Add(file); // add to deletable
                    }
                }
            }
        }
        private System.Collections.ArrayList ReadDeleteableFiles()
        {
            System.Collections.ArrayList result = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
            if (!directory.FileExists(IndexFileNames.DELETABLE))
                return result;
            
            IndexInput input = directory.OpenInput(IndexFileNames.DELETABLE);
            try
            {
                for (int i = input.ReadInt(); i > 0; i--)
                {
                    // read file names
                    result.Add(input.ReadString());
                }
            }
            finally
            {
                input.Close();
            }
            return result;
        }
        private void WriteDeleteableFiles(System.Collections.ArrayList files)
        {
            // write the list to a temporary file, then rename it over the old
            // one, so a crash can never leave a half-written deletable file
            IndexOutput output = directory.CreateOutput("deleteable.new");
            try
            {
                output.WriteInt(files.Count);
                for (int i = 0; i < files.Count; i++)
                    output.WriteString((System.String) files[i]);
            }
            finally
            {
                output.Close();
            }
            directory.RenameFile("deleteable.new", IndexFileNames.DELETABLE);
        }
    }
}