beagled/Lucene.Net/Index/SegmentMerger.cs
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
namespace Lucene.Net.Index
{
    /// <summary> The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}),
    /// into a single Segment. After adding the appropriate readers, call the merge method to combine the
    /// segments.
    /// <P>
    /// If the compoundFile flag is set, then the segments will be merged into a compound file.
    /// </summary>
    /// <seealso cref="merge">
    /// </seealso>
    /// <seealso cref="add">
    /// </seealso>
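    /// <example>
    /// Added usage sketch (not part of the original source; the directory and
    /// reader names below are illustrative):
    /// <code>
    /// SegmentMerger merger = new SegmentMerger(destDir, "mergedSegment");
    /// merger.Add(readerA);                  // IndexReader over an existing segment
    /// merger.Add(readerB);
    /// int mergedDocCount = merger.Merge();  // write the merged segment files
    /// merger.CloseReaders();
    /// </code>
    /// </example>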
    public sealed class SegmentMerger
    {
        private void InitBlock()
        {
            termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
        }

        private Directory directory;
        private System.String segment;
        private int termIndexInterval;

        private System.Collections.ArrayList readers = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        private FieldInfos fieldInfos;

        /// <summary>This ctor used only by test code.
        /// </summary>
        /// <param name="dir">The Directory to merge the other segments into
        /// </param>
        /// <param name="name">The name of the new segment
        /// </param>
        public /*internal*/ SegmentMerger(Directory dir, System.String name)
        {
            InitBlock();
            directory = dir;
            segment = name;
        }

        internal SegmentMerger(IndexWriter writer, System.String name)
        {
            InitBlock();
            directory = writer.GetDirectory();
            segment = name;
            termIndexInterval = writer.GetTermIndexInterval();
        }
        /// <summary> Add an IndexReader to the collection of readers that are to be merged</summary>
        /// <param name="reader">
        /// </param>
        public /*internal*/ void Add(IndexReader reader)
        {
            readers.Add(reader);
        }

        /// <summary> </summary>
        /// <param name="i">The index of the reader to return
        /// </param>
        /// <returns> The ith reader to be merged
        /// </returns>
        internal IndexReader SegmentReader(int i)
        {
            return (IndexReader) readers[i];
        }

        /// <summary> Merges the readers specified by the {@link #add} method into the directory passed to the constructor</summary>
        /// <returns> The number of documents that were merged
        /// </returns>
        /// <throws> IOException </throws>
        public /*internal*/ int Merge()
        {
            int value_Renamed;

            value_Renamed = MergeFields();
            MergeTerms();
            MergeNorms();

            if (fieldInfos.HasVectors())
                MergeVectors();

            return value_Renamed;
        }

        /// <summary> Close all IndexReaders that have been added.
        /// Should not be called before merge().
        /// </summary>
        /// <throws> IOException </throws>
        public /*internal*/ void CloseReaders()
        {
            for (int i = 0; i < readers.Count; i++)
            {
                // close readers
                IndexReader reader = (IndexReader) readers[i];
                reader.Close();
            }
        }
        public System.Collections.ArrayList CreateCompoundFile(System.String fileName)
        {
            CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName);

            System.Collections.ArrayList files = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(IndexFileNames.COMPOUND_EXTENSIONS.Length + fieldInfos.Size()));

            // Basic files
            for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.Length; i++)
            {
                files.Add(segment + "." + IndexFileNames.COMPOUND_EXTENSIONS[i]);
            }

            // Field norm files
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed && !fi.omitNorms)
                {
                    files.Add(segment + ".f" + i);
                }
            }

            // Vector files
            if (fieldInfos.HasVectors())
            {
                for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.Length; i++)
                {
                    files.Add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
                }
            }

            // Now merge all added files
            System.Collections.IEnumerator it = files.GetEnumerator();
            while (it.MoveNext())
            {
                cfsWriter.AddFile((System.String) it.Current);
            }

            // Perform the merge
            cfsWriter.Close();

            return files;
        }
        private void AddIndexed(IndexReader reader, FieldInfos fieldInfos, System.Collections.ICollection names, bool storeTermVectors, bool storePositionWithTermVector, bool storeOffsetWithTermVector)
        {
            System.Collections.IEnumerator i = names.GetEnumerator();
            while (i.MoveNext())
            {
                System.Collections.DictionaryEntry e = (System.Collections.DictionaryEntry) i.Current;
                System.String field = (System.String) e.Key;
                fieldInfos.Add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.HasNorms(field));
            }
        }

        /// <summary> </summary>
        /// <returns> The number of documents in all of the readers
        /// </returns>
        /// <throws> IOException </throws>
        private int MergeFields()
        {
            fieldInfos = new FieldInfos(); // merge field names
            int docCount = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
                AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
                fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false);
            }
            fieldInfos.Write(directory, segment + ".fnm");

            FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
            try
            {
                for (int i = 0; i < readers.Count; i++)
                {
                    IndexReader reader = (IndexReader) readers[i];
                    int maxDoc = reader.MaxDoc();
                    for (int j = 0; j < maxDoc; j++)
                        if (!reader.IsDeleted(j))
                        {
                            // skip deleted docs
                            fieldsWriter.AddDocument(reader.Document(j));
                            docCount++;
                        }
                }
            }
            finally
            {
                fieldsWriter.Close();
            }
            return docCount;
        }
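        // Added note (not in the original file): MergeFields() writes the merged
        // field-name table (segment + ".fnm") and copies every non-deleted stored
        // document through FieldsWriter, so the returned docCount can be smaller
        // than the sum of MaxDoc() over the readers when deletions are present.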
        /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
        /// <throws> IOException </throws>
        private void MergeVectors()
        {
            TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int r = 0; r < readers.Count; r++)
                {
                    IndexReader reader = (IndexReader) readers[r];
                    int maxDoc = reader.MaxDoc();
                    for (int docNum = 0; docNum < maxDoc; docNum++)
                    {
                        // skip deleted docs
                        if (reader.IsDeleted(docNum))
                            continue;
                        termVectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docNum));
                    }
                }
            }
            finally
            {
                termVectorsWriter.Close();
            }
        }

        private IndexOutput freqOutput = null;
        private IndexOutput proxOutput = null;
        private TermInfosWriter termInfosWriter = null;
        private int skipInterval;
        private SegmentMergeQueue queue = null;
        private void MergeTerms()
        {
            try
            {
                freqOutput = directory.CreateOutput(segment + ".frq");
                proxOutput = directory.CreateOutput(segment + ".prx");
                termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                skipInterval = termInfosWriter.skipInterval;
                queue = new SegmentMergeQueue(readers.Count);

                MergeTermInfos();
            }
            finally
            {
                if (freqOutput != null)
                    freqOutput.Close();
                if (proxOutput != null)
                    proxOutput.Close();
                if (termInfosWriter != null)
                    termInfosWriter.Close();
                if (queue != null)
                    queue.Close();
            }
        }
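        // Added commentary (not in the original file): MergeTermInfos() below walks
        // the term enumerations of all readers in sorted term order by pushing one
        // SegmentMergeInfo per reader onto a SegmentMergeQueue (a priority queue).
        // All segments currently positioned on the same term are popped together
        // and handed to MergeTermInfo() as a single group.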
        private void MergeTermInfos()
        {
            int base_Renamed = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                TermEnum termEnum = reader.Terms();
                SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
                base_Renamed += reader.NumDocs();
                if (smi.Next())
                    queue.Put(smi); // initialize queue
                else
                    smi.Close();
            }

            SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

            while (queue.Size() > 0)
            {
                int matchSize = 0; // pop matching terms
                match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                Term term = match[0].term;
                SegmentMergeInfo top = (SegmentMergeInfo) queue.Top();

                while (top != null && term.CompareTo(top.term) == 0)
                {
                    match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                    top = (SegmentMergeInfo) queue.Top();
                }

                MergeTermInfo(match, matchSize); // add new TermInfo

                while (matchSize > 0)
                {
                    SegmentMergeInfo smi = match[--matchSize];
                    if (smi.Next())
                        queue.Put(smi); // restore queue
                    else
                        smi.Close(); // done with a segment
                }
            }
        }

        private TermInfo termInfo = new TermInfo(); // minimize consing

        /// <summary>Merge one term found in one or more segments. The array <code>smis</code>
        /// contains segments that are positioned at the same term. <code>N</code>
        /// is the number of cells in the array actually occupied.
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();

            int df = AppendPostings(smis, n); // append posting data

            long skipPointer = WriteSkip();

            if (df > 0)
            {
                // add an entry to the dictionary with pointers to prox and freq files
                termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
                termInfosWriter.Add(smis[0].term, termInfo);
            }
        }
        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into the freqOutput and
        /// proxOutput streams.
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        /// <returns> number of documents across all segments where this term was found
        /// </returns>
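        /// <remarks>Added worked example (not in the original source) of the .frq
        /// encoding performed below: document numbers are written as deltas shifted
        /// left by one, with the low bit set when freq == 1. For docs 5 (freq 1) and
        /// 9 (freq 3), starting from lastDoc = 0:
        ///   doc 5: docCode = (5 - 0) &lt;&lt; 1 = 10; freq == 1, so WriteVInt(10 | 1) = 11
        ///   doc 9: docCode = (9 - 5) &lt;&lt; 1 = 8;  freq == 3, so WriteVInt(8), WriteVInt(3)
        /// Positions are written to .prx as deltas from the previous position.
        /// </remarks>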
        private int AppendPostings(SegmentMergeInfo[] smis, int n)
        {
            int lastDoc = 0;
            int df = 0; // number of docs w/ term
            ResetSkip();
            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi = smis[i];
                TermPositions postings = smi.GetPositions();
                int base_Renamed = smi.base_Renamed;
                int[] docMap = smi.GetDocMap();
                postings.Seek(smi.termEnum);
                while (postings.Next())
                {
                    int doc = postings.Doc();
                    if (docMap != null)
                        doc = docMap[doc]; // map around deletions
                    doc += base_Renamed; // convert to merged space

                    if (doc < lastDoc)
                        throw new System.SystemException("docs out of order");

                    df++;

                    if ((df % skipInterval) == 0)
                    {
                        BufferSkip(lastDoc);
                    }

                    int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
                    lastDoc = doc;

                    int freq = postings.Freq();
                    if (freq == 1)
                    {
                        freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
                    }
                    else
                    {
                        freqOutput.WriteVInt(docCode); // write doc
                        freqOutput.WriteVInt(freq); // write frequency in doc
                    }

                    int lastPosition = 0; // write position deltas
                    for (int j = 0; j < freq; j++)
                    {
                        int position = postings.NextPosition();
                        proxOutput.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }
            }
            return df;
        }
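        // Added commentary (not in the original file): skipBuffer accumulates one
        // skip entry per skipInterval documents for the current term; each entry
        // stores the doc-number delta and the .frq/.prx file-pointer deltas since
        // the previous entry. WriteSkip() appends the buffered entries to the .frq
        // stream right after the term's postings and returns their start offset.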
        private RAMOutputStream skipBuffer = new RAMOutputStream();
        private int lastSkipDoc;
        private long lastSkipFreqPointer;
        private long lastSkipProxPointer;

        private void ResetSkip()
        {
            skipBuffer.Reset();
            lastSkipDoc = 0;
            lastSkipFreqPointer = freqOutput.GetFilePointer();
            lastSkipProxPointer = proxOutput.GetFilePointer();
        }

        private void BufferSkip(int doc)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();

            skipBuffer.WriteVInt(doc - lastSkipDoc);
            skipBuffer.WriteVInt((int) (freqPointer - lastSkipFreqPointer));
            skipBuffer.WriteVInt((int) (proxPointer - lastSkipProxPointer));

            lastSkipDoc = doc;
            lastSkipFreqPointer = freqPointer;
            lastSkipProxPointer = proxPointer;
        }

        private long WriteSkip()
        {
            long skipPointer = freqOutput.GetFilePointer();
            skipBuffer.WriteTo(freqOutput);
            return skipPointer;
        }
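        // Added note (not in the original file): MergeNorms() below writes one norms
        // file per indexed field that does not omit norms (segment + ".f" + fieldNumber),
        // copying a single norm byte for every non-deleted document from each reader.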
        private void MergeNorms()
        {
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed && !fi.omitNorms)
                {
                    IndexOutput output = directory.CreateOutput(segment + ".f" + i);
                    try
                    {
                        for (int j = 0; j < readers.Count; j++)
                        {
                            IndexReader reader = (IndexReader) readers[j];
                            int maxDoc = reader.MaxDoc();
                            byte[] input = new byte[maxDoc];
                            reader.Norms(fi.name, input, 0);
                            for (int k = 0; k < maxDoc; k++)
                                if (!reader.IsDeleted(k))
                                    output.WriteByte(input[k]);
                        }
                    }
                    finally
                    {
                        output.Close();
                    }
                }
            }
        }
    }
}