Update the thread-local storage patch, to fix #335178
[beagle.git] / beagled / Lucene.Net / Index / SegmentMerger.cs
blob9f70d756fb7162c58cfae99c1d86844ed411d036
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 using System;
17 using Directory = Lucene.Net.Store.Directory;
18 using IndexOutput = Lucene.Net.Store.IndexOutput;
19 using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
20 namespace Lucene.Net.Index
/// <summary> The SegmentMerger class combines two or more Segments, each represented by an IndexReader (see <see cref="Add"/>),
/// into a single Segment. After adding the appropriate readers, call the <see cref="Merge"/> method to combine the
/// segments.
/// <para>
/// If the compoundFile flag is set, then the segments will be merged into a compound file.
/// </para>
/// </summary>
/// <seealso cref="Merge"/>
/// <seealso cref="Add"/>
35 sealed public class SegmentMerger
/// <summary>
/// Seeds <c>termIndexInterval</c> with <c>IndexWriter.DEFAULT_TERM_INDEX_INTERVAL</c>.
/// Both constructors call this before assigning their own state.
/// </summary>
private void InitBlock()
{
    this.termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
}
// Directory that receives every file written by this merger.
private Directory directory;

// Name of the merged segment; used as the prefix of the file names built in this class.
private string segment;

// Passed to TermInfosWriter in MergeTerms(); set by InitBlock() and optionally
// overridden by the IndexWriter-based constructor.
private int termIndexInterval;

// Readers registered through Add(), held in a synchronized ArrayList wrapper.
private System.Collections.ArrayList readers = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));

// Field metadata of the merged segment; (re)created by MergeFields().
private FieldInfos fieldInfos;

// File extensions of old-style index files
private static readonly string[] COMPOUND_EXTENSIONS = { "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis" };

// Extensions added to the compound file only when fieldInfos.HasVectors() is true.
private static readonly string[] VECTOR_EXTENSIONS = { "tvx", "tvd", "tvf" };
/// <summary>
/// Creates a merger that keeps the default term index interval.
/// This constructor is used only by test code.
/// </summary>
/// <param name="dir">The Directory to merge the other segments into.</param>
/// <param name="name">The name of the new segment.</param>
public /*internal*/ SegmentMerger(Directory dir, System.String name)
{
    // InitBlock() must run first: it establishes the default termIndexInterval.
    InitBlock();
    this.segment = name;
    this.directory = dir;
}
/// <summary>
/// Creates a merger whose target directory and term index interval are taken
/// from the given <c>IndexWriter</c>.
/// </summary>
/// <param name="writer">Supplies the directory (<c>GetDirectory()</c>) and the
/// term index interval (<c>GetTermIndexInterval()</c>).</param>
/// <param name="name">The name of the new segment.</param>
internal SegmentMerger(IndexWriter writer, System.String name)
{
    InitBlock();
    this.segment = name;
    this.directory = writer.GetDirectory();
    // Replaces the default that InitBlock() just installed.
    this.termIndexInterval = writer.GetTermIndexInterval();
}
/// <summary>Appends an IndexReader to the collection of readers that are to be merged.</summary>
/// <param name="reader">The reader to append; it is stored as-is, with no validation.</param>
public /*internal*/ void Add(IndexReader reader)
{
    this.readers.Add(reader);
}
/// <summary>Returns one of the readers previously registered with <c>Add</c>.</summary>
/// <param name="i">The index of the reader to return.</param>
/// <returns>The i-th reader to be merged.</returns>
internal IndexReader SegmentReader(int i)
{
    object entry = readers[i];
    return (IndexReader) entry;
}
/// <summary>
/// Merges the readers registered through <c>Add</c> into the directory given to the
/// constructor. Runs the field, term and norm merges in that order, then merges term
/// vectors only if the merged field infos report vectors.
/// </summary>
/// <returns>The number of documents that were merged, as counted by <c>MergeFields</c>.</returns>
/// <throws> IOException </throws>
public /*internal*/ int Merge()
{
    int mergedDocCount = MergeFields();
    MergeTerms();
    MergeNorms();

    if (fieldInfos.HasVectors())
    {
        MergeVectors();
    }

    return mergedDocCount;
}
/// <summary>
/// Closes every IndexReader that has been added, in insertion order.
/// Should not be called before <c>Merge()</c>.
/// </summary>
/// <remarks>An exception thrown by one reader's <c>Close()</c> propagates immediately;
/// the readers after it are not closed.</remarks>
/// <throws> IOException </throws>
public /*internal*/ void CloseReaders()
{
    int index = 0;
    while (index < readers.Count)
    {
        ((IndexReader) readers[index]).Close();
        index++;
    }
}
/// <summary>
/// Builds the list of files that belong to the merged segment and hands each of them
/// to a <c>CompoundFileWriter</c> created for <paramref name="fileName"/>.
/// </summary>
/// <param name="fileName">Name passed to the <c>CompoundFileWriter</c> constructor.</param>
/// <returns>The file names that were added to the compound file, as a synchronized
/// ArrayList of strings.</returns>
internal System.Collections.ArrayList CreateCompoundFile(System.String fileName)
{
    CompoundFileWriter compoundWriter = new CompoundFileWriter(directory, fileName);

    int expectedCount = COMPOUND_EXTENSIONS.Length + fieldInfos.Size();
    System.Collections.ArrayList memberFiles =
        System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(expectedCount));

    // Basic files
    foreach (string extension in COMPOUND_EXTENSIONS)
    {
        memberFiles.Add(segment + "." + extension);
    }

    // Field norm files: one ".f<number>" file per indexed field, numbered by field index.
    for (int fieldNumber = 0; fieldNumber < fieldInfos.Size(); fieldNumber++)
    {
        FieldInfo info = fieldInfos.FieldInfo(fieldNumber);
        if (!info.isIndexed)
        {
            continue;
        }
        memberFiles.Add(segment + ".f" + fieldNumber);
    }

    // Vector files
    if (fieldInfos.HasVectors())
    {
        foreach (string extension in VECTOR_EXTENSIONS)
        {
            memberFiles.Add(segment + "." + extension);
        }
    }

    // Now merge all added files
    foreach (string memberFile in memberFiles)
    {
        compoundWriter.AddFile(memberFile);
    }

    // Perform the merge
    compoundWriter.Close();

    return memberFiles;
}
/// <summary>
/// Rebuilds <c>fieldInfos</c> from the field names of every reader, writes it to
/// "&lt;segment&gt;.fnm", and then copies the stored documents of all non-deleted docs
/// through a <c>FieldsWriter</c>.
/// </summary>
/// <returns>The number of non-deleted documents copied from all of the readers.</returns>
/// <throws> IOException </throws>
private int MergeFields()
{
    fieldInfos = new FieldInfos(); // merge Field names

    // Pass 1: register the field names of each reader, one FieldOption at a time.
    for (int r = 0; r < readers.Count; r++)
    {
        IndexReader source = (IndexReader) readers[r];
        fieldInfos.AddIndexed(source.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
        fieldInfos.AddIndexed(source.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
        fieldInfos.AddIndexed(source.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
        fieldInfos.AddIndexed(source.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
        fieldInfos.AddIndexed(source.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
        fieldInfos.Add(source.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false);
    }
    fieldInfos.Write(directory, segment + ".fnm");

    // Pass 2: copy stored documents; the writer is closed even if copying fails.
    int copiedDocs = 0;
    FieldsWriter storedFieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
    try
    {
        for (int r = 0; r < readers.Count; r++)
        {
            IndexReader source = (IndexReader) readers[r];
            int docLimit = source.MaxDoc();
            for (int docId = 0; docId < docLimit; docId++)
            {
                // skip deleted docs
                if (source.IsDeleted(docId))
                {
                    continue;
                }
                storedFieldsWriter.AddDocument(source.Document(docId));
                copiedDocs++;
            }
        }
    }
    finally
    {
        storedFieldsWriter.Close();
    }
    return copiedDocs;
}
/// <summary>
/// Merges the term vectors from each of the segments into the new one. For every
/// non-deleted document of every reader, all of its term frequency vectors are handed to
/// a <c>TermVectorsWriter</c>, which is closed even if an error occurs.
/// </summary>
/// <throws> IOException </throws>
private void MergeVectors()
{
    TermVectorsWriter vectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);

    try
    {
        for (int readerIndex = 0; readerIndex < readers.Count; readerIndex++)
        {
            IndexReader source = (IndexReader) readers[readerIndex];
            int docLimit = source.MaxDoc();
            for (int docId = 0; docId < docLimit; docId++)
            {
                // Deleted docs contribute nothing to the merged segment.
                if (!source.IsDeleted(docId))
                {
                    vectorsWriter.AddAllDocVectors(source.GetTermFreqVectors(docId));
                }
            }
        }
    }
    finally
    {
        vectorsWriter.Close();
    }
}
// State shared by the term-merging methods; populated by MergeTerms().

// Output created for "<segment>.frq".
private IndexOutput freqOutput;
// Output created for "<segment>.prx".
private IndexOutput proxOutput;
// Receives one (term, TermInfo) entry per merged term.
private TermInfosWriter termInfosWriter;
// Copied from termInfosWriter.skipInterval; AppendPostings buffers a skip entry every skipInterval docs.
private int skipInterval;
// Holds one SegmentMergeInfo per reader that still has terms left.
private SegmentMergeQueue queue;
/// <summary>
/// Opens the ".frq" and ".prx" outputs, the <c>TermInfosWriter</c> and the merge queue,
/// runs <c>MergeTermInfos()</c>, and closes all four resources again.
/// </summary>
/// <remarks>
/// Each resource is closed inside its own try/finally, so a failing <c>Close()</c> on one
/// of them no longer prevents the remaining ones from being closed. If several closes
/// throw, the exception thrown last is the one that propagates.
/// </remarks>
/// <throws> IOException </throws>
private void MergeTerms()
{
    try
    {
        freqOutput = directory.CreateOutput(segment + ".frq");
        proxOutput = directory.CreateOutput(segment + ".prx");
        termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
        skipInterval = termInfosWriter.skipInterval;
        queue = new SegmentMergeQueue(readers.Count);

        MergeTermInfos();
    }
    finally
    {
        // Nested finally blocks keep the original close order (freq, prox, term infos,
        // queue) while guaranteeing that every non-null resource gets a Close() attempt.
        try
        {
            if (freqOutput != null)
            {
                freqOutput.Close();
            }
        }
        finally
        {
            try
            {
                if (proxOutput != null)
                {
                    proxOutput.Close();
                }
            }
            finally
            {
                try
                {
                    if (termInfosWriter != null)
                    {
                        termInfosWriter.Close();
                    }
                }
                finally
                {
                    if (queue != null)
                    {
                        queue.Close();
                    }
                }
            }
        }
    }
}
/// <summary>
/// Drives the term merge. Seeds the queue with one <c>SegmentMergeInfo</c> per reader,
/// then repeatedly pops every entry positioned on the same term, merges that term via
/// <c>MergeTermInfo</c>, and advances the popped entries.
/// </summary>
private void MergeTermInfos()
{
    // initialize queue: each reader gets a doc-number base equal to the summed
    // NumDocs() of the readers before it.
    int docBase = 0;
    for (int readerIndex = 0; readerIndex < readers.Count; readerIndex++)
    {
        IndexReader source = (IndexReader) readers[readerIndex];
        TermEnum terms = source.Terms();
        SegmentMergeInfo info = new SegmentMergeInfo(docBase, terms, source);
        docBase += source.NumDocs();
        if (info.Next())
        {
            queue.Put(info);
        }
        else
        {
            // Next() returned false right away, so nothing is queued for this reader.
            info.Close();
        }
    }

    SegmentMergeInfo[] sameTerm = new SegmentMergeInfo[readers.Count];

    while (queue.Size() > 0)
    {
        // pop matching terms
        int sameTermCount = 0;
        sameTerm[sameTermCount++] = (SegmentMergeInfo) queue.Pop();
        Term current = sameTerm[0].term;

        SegmentMergeInfo head = (SegmentMergeInfo) queue.Top();
        while (head != null && current.CompareTo(head.term) == 0)
        {
            sameTerm[sameTermCount++] = (SegmentMergeInfo) queue.Pop();
            head = (SegmentMergeInfo) queue.Top();
        }

        MergeTermInfo(sameTerm, sameTermCount); // add new TermInfo

        // restore queue: advance the popped entries, last popped first.
        for (int slot = sameTermCount - 1; slot >= 0; slot--)
        {
            SegmentMergeInfo advanced = sameTerm[slot];
            if (advanced.Next())
            {
                queue.Put(advanced);
            }
            else
            {
                advanced.Close(); // done with a segment
            }
        }
    }
}
// Single TermInfo instance reused for every merged term (minimize consing).
private TermInfo termInfo = new TermInfo();

/// <summary>Merge one term found in one or more segments. The array <code>smis</code>
/// contains segments that are positioned at the same term. <code>n</code>
/// is the number of cells in the array actually occupied.
/// </summary>
/// <remarks>
/// The skip data is written through <c>WriteSkip()</c> regardless of the document
/// frequency; a dictionary entry is added only when at least one document was appended.
/// </remarks>
/// <param name="smis">array of segments</param>
/// <param name="n">number of cells in the array actually occupied</param>
private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
{
    // Remember where this term's data starts in both outputs before anything is appended.
    long freqStart = freqOutput.GetFilePointer();
    long proxStart = proxOutput.GetFilePointer();

    int docFreq = AppendPostings(smis, n); // append posting data

    long skipStart = WriteSkip();

    if (docFreq <= 0)
    {
        return;
    }

    // add an entry to the dictionary with pointers to prox and freq files
    int skipOffset = (int) (skipStart - freqStart);
    termInfo.Set(docFreq, freqStart, proxStart, skipOffset);
    termInfosWriter.Add(smis[0].term, termInfo);
}
/// <summary>Process postings from multiple segments all positioned on the
/// same term. Writes out merged entries into freqOutput and
/// the proxOutput streams.
/// </summary>
/// <remarks>
/// Each document is written to <c>freqOutput</c> as the delta to the previous document
/// shifted left by one bit; the low bit is set when the frequency is exactly 1, otherwise
/// the frequency follows as a separate VInt. Positions go to <c>proxOutput</c> as deltas.
/// A skip entry is buffered every <c>skipInterval</c> documents.
/// </remarks>
/// <param name="smis">array of segments</param>
/// <param name="n">number of cells in the array actually occupied</param>
/// <returns> number of documents across all segments where this term was found</returns>
/// <exception cref="System.InvalidOperationException">A document number is negative, or
/// is not strictly greater than the previously written one. (This type derives from
/// <c>System.SystemException</c>, which this method threw previously.)</exception>
private int AppendPostings(SegmentMergeInfo[] smis, int n)
{
    int lastDoc = 0;
    int df = 0; // number of docs w/ term
    ResetSkip();
    for (int i = 0; i < n; i++)
    {
        SegmentMergeInfo smi = smis[i];
        TermPositions postings = smi.postings;
        int base_Renamed = smi.base_Renamed;
        int[] docMap = smi.docMap;
        postings.Seek(smi.termEnum);
        while (postings.Next())
        {
            int doc = postings.Doc();
            if (docMap != null)
            {
                doc = docMap[doc]; // map around deletions
            }
            doc += base_Renamed; // convert to merged space

            // Merged doc numbers must be non-negative and strictly increasing. The very
            // first doc (df == 0) may legitimately equal the initial lastDoc of 0; after
            // that, a repeated id would be written as a zero delta and corrupt the stream.
            if (doc < 0 || (df > 0 && doc <= lastDoc))
            {
                throw new System.InvalidOperationException("docs out of order (" + doc + " <= " + lastDoc + ")");
            }

            df++;

            if ((df % skipInterval) == 0)
            {
                // Skip entries record the previous doc, i.e. the state before this one is written.
                BufferSkip(lastDoc);
            }

            int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
            lastDoc = doc;

            int freq = postings.Freq();
            if (freq == 1)
            {
                freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
            }
            else
            {
                freqOutput.WriteVInt(docCode); // write doc
                freqOutput.WriteVInt(freq); // write frequency in doc
            }

            int lastPosition = 0; // write position deltas
            for (int j = 0; j < freq; j++)
            {
                int position = postings.NextPosition();
                proxOutput.WriteVInt(position - lastPosition);
                lastPosition = position;
            }
        }
    }
    return df;
}
// Skip entries of the term currently being merged are buffered here and copied to
// freqOutput by WriteSkip().
private RAMOutputStream skipBuffer = new RAMOutputStream();

// Document number and output positions recorded by the most recent ResetSkip()/BufferSkip();
// the next skip entry is written as deltas against these values.
private int lastSkipDoc;
private long lastSkipFreqPointer, lastSkipProxPointer;
/// <summary>
/// Starts a fresh skip list: resets the skip buffer, sets the last skipped doc to 0 and
/// records the current positions of the freq and prox outputs as the delta baselines.
/// </summary>
private void ResetSkip()
{
    this.skipBuffer.Reset();
    this.lastSkipDoc = 0;
    this.lastSkipFreqPointer = this.freqOutput.GetFilePointer();
    this.lastSkipProxPointer = this.proxOutput.GetFilePointer();
}
/// <summary>
/// Appends one skip entry to the skip buffer: the doc delta plus the freq and prox
/// pointer deltas since the previous entry, each written as a VInt. The given values
/// then become the baseline for the next entry.
/// </summary>
/// <param name="doc">Document number recorded by this skip entry.</param>
private void BufferSkip(int doc)
{
    long currentFreq = freqOutput.GetFilePointer();
    long currentProx = proxOutput.GetFilePointer();

    int docDelta = doc - lastSkipDoc;
    int freqDelta = (int) (currentFreq - lastSkipFreqPointer);
    int proxDelta = (int) (currentProx - lastSkipProxPointer);

    skipBuffer.WriteVInt(docDelta);
    skipBuffer.WriteVInt(freqDelta);
    skipBuffer.WriteVInt(proxDelta);

    lastSkipDoc = doc;
    lastSkipFreqPointer = currentFreq;
    lastSkipProxPointer = currentProx;
}
/// <summary>Copies the buffered skip entries into <c>freqOutput</c>.</summary>
/// <returns>The position of <c>freqOutput</c> before the skip data was written.</returns>
private long WriteSkip()
{
    long skipStart = freqOutput.GetFilePointer();
    skipBuffer.WriteTo(freqOutput);
    return skipStart;
}
/// <summary>
/// Writes one norms file per indexed field, named "&lt;segment&gt;.f&lt;field number&gt;".
/// For each reader in turn, the field's norms are read into a buffer of <c>MaxDoc()</c>
/// bytes and the bytes of all non-deleted documents are appended to the output.
/// </summary>
private void MergeNorms()
{
    for (int fieldNumber = 0; fieldNumber < fieldInfos.Size(); fieldNumber++)
    {
        FieldInfo info = fieldInfos.FieldInfo(fieldNumber);
        if (!info.isIndexed)
        {
            continue;
        }

        IndexOutput normsOut = directory.CreateOutput(segment + ".f" + fieldNumber);
        try
        {
            for (int readerIndex = 0; readerIndex < readers.Count; readerIndex++)
            {
                IndexReader source = (IndexReader) readers[readerIndex];
                int docLimit = source.MaxDoc();
                byte[] norms = new byte[docLimit];
                source.Norms(info.name, norms, 0);
                for (int docId = 0; docId < docLimit; docId++)
                {
                    // Deleted docs are dropped from the merged segment, so their norm byte is skipped.
                    if (source.IsDeleted(docId))
                    {
                        continue;
                    }
                    normsOut.WriteByte(norms[docId]);
                }
            }
        }
        finally
        {
            normsOut.Close();
        }
    }
}