// Source: beagled/Lucene.Net/Index/SegmentMerger.cs (beagle.git)
// Imported from the Lucene.Net port of Apache Lucene; based on a patch from D Bera.
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 using System;
17 using Directory = Lucene.Net.Store.Directory;
18 using OutputStream = Lucene.Net.Store.OutputStream;
19 using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
20 namespace Lucene.Net.Index
/// <summary> The SegmentMerger class combines two or more Segments, each represented
/// by an IndexReader (<see cref="Add"/>), into a single Segment.  After adding the
/// appropriate readers, call <see cref="Merge"/> to combine the segments.
/// <P>
/// If the compoundFile flag is set, the merged segment is packed into a compound file.
/// </summary>
/// <seealso cref="#merge">
/// </seealso>
/// <seealso cref="#add">
/// </seealso>
sealed public class SegmentMerger
{
	// Whether the merged segment should be written as a single .cfs compound file.
	private bool useCompoundFile;
	// Target directory that receives the merged segment's files.
	private Directory directory;
	// Name of the new (merged) segment.
	private System.String segment;

	// Readers queued for merging; synchronized wrapper matches the original port.
	private System.Collections.ArrayList readers = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
	// Union of field metadata across all merged readers; built by MergeFields().
	private FieldInfos fieldInfos;

	// File extensions of old-style index files
	private static readonly System.String[] COMPOUND_EXTENSIONS = new System.String[]{"fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"};
	private static readonly System.String[] VECTOR_EXTENSIONS = new System.String[]{"tvx", "tvd", "tvf"};
48 /// <summary> </summary>
49 /// <param name="dir">The Directory to merge the other segments into
50 /// </param>
51 /// <param name="name">The name of the new segment
52 /// </param>
53 /// <param name="compoundFile">true if the new segment should use a compoundFile
54 /// </param>
55 public /*internal*/ SegmentMerger(Directory dir, System.String name, bool compoundFile)
57 directory = dir;
58 segment = name;
59 useCompoundFile = compoundFile;
62 /// <summary> Add an IndexReader to the collection of readers that are to be merged</summary>
63 /// <param name="">reader
64 /// </param>
65 public /*internal*/ void Add(IndexReader reader)
67 readers.Add(reader);
70 /// <summary> </summary>
71 /// <param name="i">The index of the reader to return
72 /// </param>
73 /// <returns> The ith reader to be merged
74 /// </returns>
75 internal IndexReader SegmentReader(int i)
77 return (IndexReader) readers[i];
80 /// <summary> Merges the readers specified by the {@link #add} method into the directory passed to the constructor</summary>
81 /// <returns> The number of documents that were merged
82 /// </returns>
83 /// <throws> IOException </throws>
84 public /*internal*/ int Merge()
86 int value_Renamed;
88 value_Renamed = MergeFields();
89 MergeTerms();
90 MergeNorms();
92 if (fieldInfos.HasVectors())
93 MergeVectors();
95 if (useCompoundFile)
96 CreateCompoundFile();
98 return value_Renamed;
101 /// <summary> close all IndexReaders that have been added.
102 /// Should not be called before merge().
103 /// </summary>
104 /// <throws> IOException </throws>
105 public /*internal*/ void CloseReaders()
107 for (int i = 0; i < readers.Count; i++)
109 // close readers
110 IndexReader reader = (IndexReader) readers[i];
111 reader.Close();
115 private void CreateCompoundFile()
117 CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, segment + ".cfs");
119 System.Collections.ArrayList files = new System.Collections.ArrayList(COMPOUND_EXTENSIONS.Length + fieldInfos.Size());
121 // Basic files
122 for (int i = 0; i < COMPOUND_EXTENSIONS.Length; i++)
124 files.Add(segment + "." + COMPOUND_EXTENSIONS[i]);
127 // Field norm files
128 for (int i = 0; i < fieldInfos.Size(); i++)
130 FieldInfo fi = fieldInfos.FieldInfo(i);
131 if (fi.isIndexed)
133 files.Add(segment + ".f" + i);
137 // Vector files
138 if (fieldInfos.HasVectors())
140 for (int i = 0; i < VECTOR_EXTENSIONS.Length; i++)
142 files.Add(segment + "." + VECTOR_EXTENSIONS[i]);
146 // Now merge all added files
147 System.Collections.IEnumerator it = files.GetEnumerator();
148 while (it.MoveNext())
150 cfsWriter.AddFile((System.String) it.Current);
153 // Perform the merge
154 cfsWriter.Close();
156 // Now delete the source files
157 it = files.GetEnumerator();
158 while (it.MoveNext())
160 directory.DeleteFile((System.String) it.Current);
164 /// <summary> </summary>
165 /// <returns> The number of documents in all of the readers
166 /// </returns>
167 /// <throws> IOException </throws>
168 private int MergeFields()
170 fieldInfos = new FieldInfos(); // merge Field names
171 int docCount = 0;
172 for (int i = 0; i < readers.Count; i++)
174 IndexReader reader = (IndexReader) readers[i];
175 fieldInfos.AddIndexed(reader.GetIndexedFieldNames(true), true);
176 fieldInfos.AddIndexed(reader.GetIndexedFieldNames(false), false);
177 fieldInfos.Add(reader.GetFieldNames(false), false);
179 fieldInfos.Write(directory, segment + ".fnm");
181 FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
184 for (int i = 0; i < readers.Count; i++)
186 IndexReader reader = (IndexReader) readers[i];
187 int maxDoc = reader.MaxDoc();
188 for (int j = 0; j < maxDoc; j++)
189 if (!reader.IsDeleted(j))
191 // skip deleted docs
192 fieldsWriter.AddDocument(reader.Document(j));
193 docCount++;
197 finally
199 fieldsWriter.Close();
201 return docCount;
204 /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
205 /// <throws> IOException </throws>
206 private void MergeVectors()
208 TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);
212 for (int r = 0; r < readers.Count; r++)
214 IndexReader reader = (IndexReader) readers[r];
215 int maxDoc = reader.MaxDoc();
216 for (int docNum = 0; docNum < maxDoc; docNum++)
218 // skip deleted docs
219 if (reader.IsDeleted(docNum))
221 continue;
223 termVectorsWriter.OpenDocument();
225 // get all term vectors
226 TermFreqVector[] sourceTermVector = reader.GetTermFreqVectors(docNum);
228 if (sourceTermVector != null)
230 for (int f = 0; f < sourceTermVector.Length; f++)
232 // translate Field numbers
233 TermFreqVector termVector = sourceTermVector[f];
234 termVectorsWriter.OpenField(termVector.GetField());
235 System.String[] terms = termVector.GetTerms();
236 int[] freqs = termVector.GetTermFrequencies();
238 for (int t = 0; t < terms.Length; t++)
240 termVectorsWriter.AddTerm(terms[t], freqs[t]);
243 termVectorsWriter.CloseDocument();
248 finally
250 termVectorsWriter.Close();
254 private OutputStream freqOutput = null;
255 private OutputStream proxOutput = null;
256 private TermInfosWriter termInfosWriter = null;
257 private int skipInterval;
258 private SegmentMergeQueue queue = null;
260 private void MergeTerms()
264 freqOutput = directory.CreateFile(segment + ".frq");
265 proxOutput = directory.CreateFile(segment + ".prx");
266 termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos);
267 skipInterval = termInfosWriter.skipInterval;
268 queue = new SegmentMergeQueue(readers.Count);
270 MergeTermInfos();
272 finally
274 if (freqOutput != null)
275 freqOutput.Close();
276 if (proxOutput != null)
277 proxOutput.Close();
278 if (termInfosWriter != null)
279 termInfosWriter.Close();
280 if (queue != null)
281 queue.Close();
285 private void MergeTermInfos()
287 int base_Renamed = 0;
288 for (int i = 0; i < readers.Count; i++)
290 IndexReader reader = (IndexReader) readers[i];
291 TermEnum termEnum = reader.Terms();
292 SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
293 base_Renamed += reader.NumDocs();
294 if (smi.Next())
295 queue.Put(smi);
296 // initialize queue
297 else
298 smi.Close();
301 SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];
303 while (queue.Size() > 0)
305 int matchSize = 0; // pop matching terms
306 match[matchSize++] = (SegmentMergeInfo) queue.Pop();
307 Term term = match[0].term;
308 SegmentMergeInfo top = (SegmentMergeInfo) queue.Top();
310 while (top != null && term.CompareTo(top.term) == 0)
312 match[matchSize++] = (SegmentMergeInfo) queue.Pop();
313 top = (SegmentMergeInfo) queue.Top();
316 MergeTermInfo(match, matchSize); // add new TermInfo
318 while (matchSize > 0)
320 SegmentMergeInfo smi = match[--matchSize];
321 if (smi.Next())
322 queue.Put(smi);
323 // restore queue
324 else
325 smi.Close(); // done with a segment
330 private TermInfo termInfo = new TermInfo(); // minimize consing
332 /// <summary>Merge one term found in one or more segments. The array <code>smis</code>
333 /// contains segments that are positioned at the same term. <code>N</code>
334 /// is the number of cells in the array actually occupied.
335 ///
336 /// </summary>
337 /// <param name="smis">array of segments
338 /// </param>
339 /// <param name="n">number of cells in the array actually occupied
340 /// </param>
341 private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
343 long freqPointer = freqOutput.GetFilePointer();
344 long proxPointer = proxOutput.GetFilePointer();
346 int df = AppendPostings(smis, n); // append posting data
348 long skipPointer = WriteSkip();
350 if (df > 0)
352 // add an entry to the dictionary with pointers to prox and freq files
353 termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
354 termInfosWriter.Add(smis[0].term, termInfo);
358 /// <summary>Process postings from multiple segments all positioned on the
359 /// same term. Writes out merged entries into freqOutput and
360 /// the proxOutput streams.
361 ///
362 /// </summary>
363 /// <param name="smis">array of segments
364 /// </param>
365 /// <param name="n">number of cells in the array actually occupied
366 /// </param>
367 /// <returns> number of documents across all segments where this term was found
368 /// </returns>
369 private int AppendPostings(SegmentMergeInfo[] smis, int n)
371 int lastDoc = 0;
372 int df = 0; // number of docs w/ term
373 ResetSkip();
374 for (int i = 0; i < n; i++)
376 SegmentMergeInfo smi = smis[i];
377 TermPositions postings = smi.postings;
378 int base_Renamed = smi.base_Renamed;
379 int[] docMap = smi.docMap;
380 postings.Seek(smi.termEnum);
381 while (postings.Next())
383 int doc = postings.Doc();
384 if (docMap != null)
385 doc = docMap[doc]; // map around deletions
386 doc += base_Renamed; // convert to merged space
388 if (doc < lastDoc)
389 throw new System.SystemException("docs out of order");
391 df++;
393 if ((df % skipInterval) == 0)
395 BufferSkip(lastDoc);
398 int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
399 lastDoc = doc;
401 int freq = postings.Freq();
402 if (freq == 1)
404 freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
406 else
408 freqOutput.WriteVInt(docCode); // write doc
409 freqOutput.WriteVInt(freq); // write frequency in doc
412 int lastPosition = 0; // write position deltas
413 for (int j = 0; j < freq; j++)
415 int position = postings.NextPosition();
416 proxOutput.WriteVInt(position - lastPosition);
417 lastPosition = position;
421 return df;
424 private RAMOutputStream skipBuffer = new RAMOutputStream();
425 private int lastSkipDoc;
426 private long lastSkipFreqPointer;
427 private long lastSkipProxPointer;
429 private void ResetSkip()
431 skipBuffer.Leset();
432 lastSkipDoc = 0;
433 lastSkipFreqPointer = freqOutput.GetFilePointer();
434 lastSkipProxPointer = proxOutput.GetFilePointer();
437 private void BufferSkip(int doc)
439 long freqPointer = freqOutput.GetFilePointer();
440 long proxPointer = proxOutput.GetFilePointer();
442 skipBuffer.WriteVInt(doc - lastSkipDoc);
443 skipBuffer.WriteVInt((int) (freqPointer - lastSkipFreqPointer));
444 skipBuffer.WriteVInt((int) (proxPointer - lastSkipProxPointer));
446 lastSkipDoc = doc;
447 lastSkipFreqPointer = freqPointer;
448 lastSkipProxPointer = proxPointer;
451 private long WriteSkip()
453 long skipPointer = freqOutput.GetFilePointer();
454 skipBuffer.WriteTo(freqOutput);
455 return skipPointer;
458 private void MergeNorms()
460 for (int i = 0; i < fieldInfos.Size(); i++)
462 FieldInfo fi = fieldInfos.FieldInfo(i);
463 if (fi.isIndexed)
465 OutputStream output = directory.CreateFile(segment + ".f" + i);
468 for (int j = 0; j < readers.Count; j++)
470 IndexReader reader = (IndexReader) readers[j];
471 byte[] input = reader.Norms(fi.name);
472 int maxDoc = reader.MaxDoc();
473 for (int k = 0; k < maxDoc; k++)
475 byte norm = input != null?input[k]:(byte) 0;
476 if (!reader.IsDeleted(k))
478 output.WriteByte(norm);
483 finally
485 output.Close();