/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 using Directory
= Lucene
.Net
.Store
.Directory
;
19 using IndexOutput
= Lucene
.Net
.Store
.IndexOutput
;
20 using RAMOutputStream
= Lucene
.Net
.Store
.RAMOutputStream
;
22 namespace Lucene
.Net
.Index
/// <summary> The SegmentMerger class combines two or more Segments, each represented by an
/// IndexReader (see <see cref="Add"/>), into a single Segment. After adding the appropriate
/// readers, call the <see cref="Merge()"/> method to combine the segments.
///
/// If the compoundFile flag is set, then the segments will be merged into a compound file.
/// </summary>
/// <seealso cref="Merge()"/>
/// <seealso cref="Add"/>
37 public sealed class SegmentMerger
/// <summary>Shared constructor initialization: seeds the term index interval
/// with the IndexWriter default.</summary>
private void InitBlock()
{
    termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
}
private Directory directory;   // destination directory for the merged segment
private System.String segment; // name of the new (merged) segment
private int termIndexInterval; // interval between indexed terms; defaulted in InitBlock()

// Readers for the segments to be merged (see Add); synchronized ArrayList is the
// port's stand-in for java.util.Vector.
private System.Collections.ArrayList readers = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
private FieldInfos fieldInfos; // merged field metadata, built in MergeFields()
/// <summary>This ctor used only by test code.</summary>
/// <param name="dir">The Directory to merge the other segments into</param>
/// <param name="name">The name of the new segment</param>
public /*internal*/ SegmentMerger(Directory dir, System.String name)
{
    // NOTE(review): body reconstructed — the original statements were dropped
    // by the extraction; verify against upstream Lucene.Net.
    InitBlock();
    directory = dir;
    segment = name;
}
/// <summary>Creates a merger that writes into the given writer's directory,
/// inheriting the writer's term index interval.</summary>
/// <param name="writer">IndexWriter whose directory and settings are used</param>
/// <param name="name">The name of the new segment</param>
internal SegmentMerger(IndexWriter writer, System.String name)
{
    InitBlock();
    directory = writer.GetDirectory();
    segment = name; // NOTE(review): reconstructed — dropped by the extraction
    termIndexInterval = writer.GetTermIndexInterval();
}
/// <summary> Add an IndexReader to the collection of readers that are to be merged</summary>
/// <param name="reader">reader for one source segment</param>
public /*internal*/ void Add(IndexReader reader)
{
    // NOTE(review): body reconstructed — dropped by the extraction. SegmentReader(i)
    // reads readers[i], so Add must append to the readers list.
    readers.Add(reader);
}
/// <summary>Returns a reader previously registered via <see cref="Add"/>.</summary>
/// <param name="i">The index of the reader to return</param>
/// <returns> The ith reader to be merged</returns>
internal IndexReader SegmentReader(int i)
{
    return (IndexReader) readers[i];
}
/// <summary> Merges the readers specified by the <see cref="Add"/> method into the directory passed to the constructor</summary>
/// <returns> The number of documents that were merged</returns>
/// <throws> IOException </throws>
public /*internal*/ int Merge()
{
    int value_Renamed;

    value_Renamed = MergeFields();
    // NOTE(review): the MergeTerms()/MergeNorms() calls were dropped by the
    // extraction and are restored here — verify against upstream Lucene.
    MergeTerms();
    MergeNorms();

    if (fieldInfos.HasVectors())
        MergeVectors();

    return value_Renamed;
}
/// <summary> close all IndexReaders that have been added.
/// Should not be called before merge().
/// </summary>
/// <throws> IOException </throws>
public /*internal*/ void CloseReaders()
{
    for (int i = 0; i < readers.Count; i++)
    {
        // close readers
        IndexReader reader = (IndexReader) readers[i];
        reader.Close(); // NOTE(review): reconstructed — the Close() call was dropped by the extraction
    }
}
/// <summary>Folds the per-segment files produced by the merge into a single
/// compound file.</summary>
/// <param name="fileName">name of the compound file to create</param>
/// <returns>the list of file names that were added to the compound file</returns>
public System.Collections.ArrayList CreateCompoundFile(System.String fileName)
{
    CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName);

    System.Collections.ArrayList files = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(IndexFileNames.COMPOUND_EXTENSIONS.Length + fieldInfos.Size()));

    // Basic files
    for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.Length; i++)
    {
        files.Add(segment + "." + IndexFileNames.COMPOUND_EXTENSIONS[i]);
    }

    // Field norm files: one ".f<i>" file per indexed field that keeps norms
    for (int i = 0; i < fieldInfos.Size(); i++)
    {
        FieldInfo fi = fieldInfos.FieldInfo(i);
        if (fi.isIndexed && !fi.omitNorms)
        {
            files.Add(segment + ".f" + i);
        }
    }

    // Vector files
    if (fieldInfos.HasVectors())
    {
        for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.Length; i++)
        {
            files.Add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
        }
    }

    // Now merge all added files
    System.Collections.IEnumerator it = files.GetEnumerator();
    while (it.MoveNext())
    {
        cfsWriter.AddFile((System.String) it.Current);
    }

    // Perform the merge.
    // NOTE(review): the Close() call and the return were dropped by the
    // extraction and are reconstructed; the declared return type requires
    // the file list to be returned.
    cfsWriter.Close();

    return files;
}
/// <summary>Registers every field name in <paramref name="names"/> with the given
/// FieldInfos as indexed, using the supplied term-vector flags; norms are
/// disabled for a field when the reader reports no norms for it.</summary>
/// <param name="reader">source reader, consulted for HasNorms</param>
/// <param name="fieldInfos">destination field table (shadows the instance field)</param>
/// <param name="names">collection of field names to register (DictionaryEntry keys)</param>
/// <param name="storeTermVectors">whether term vectors are stored for these fields</param>
/// <param name="storePositionWithTermVector">whether vector positions are stored</param>
/// <param name="storeOffsetWithTermVector">whether vector offsets are stored</param>
private void AddIndexed(IndexReader reader, FieldInfos fieldInfos, System.Collections.ICollection names, bool storeTermVectors, bool storePositionWithTermVector, bool storeOffsetWithTermVector)
{
    System.Collections.IEnumerator i = names.GetEnumerator();
    // NOTE(review): the enumeration loop was dropped by the extraction and is
    // reconstructed here — the body below clearly iterates i.Current.
    while (i.MoveNext())
    {
        System.Collections.DictionaryEntry e = (System.Collections.DictionaryEntry) i.Current;
        System.String field = (System.String) e.Key;
        fieldInfos.Add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.HasNorms(field));
    }
}
/// <summary>Merges field metadata and stored fields from all added readers.</summary>
/// <returns> The number of documents in all of the readers</returns>
/// <throws> IOException </throws>
private int MergeFields()
{
    fieldInfos = new FieldInfos(); // merge field names
    int docCount = 0; // NOTE(review): counter and return reconstructed — dropped by the extraction

    for (int i = 0; i < readers.Count; i++)
    {
        IndexReader reader = (IndexReader) readers[i];
        // Register fields from most to least specific term-vector flavor so the
        // strongest flags win for each field.
        AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
        AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
        AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
        AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
        AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
        fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false);
    }
    fieldInfos.Write(directory, segment + ".fnm");

    FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
    try
    {
        // merge field values
        for (int i = 0; i < readers.Count; i++)
        {
            IndexReader reader = (IndexReader) readers[i];
            int maxDoc = reader.MaxDoc();
            for (int j = 0; j < maxDoc; j++)
            {
                if (!reader.IsDeleted(j))
                {
                    // skip deleted docs
                    fieldsWriter.AddDocument(reader.Document(j));
                    docCount++;
                }
            }
        }
    }
    finally
    {
        // NOTE(review): try/finally reconstructed — the extraction shows only
        // the Close() call; verify placement against upstream Lucene.
        fieldsWriter.Close();
    }
    return docCount;
}
/// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
/// <throws> IOException </throws>
private void MergeVectors()
{
    TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);

    try
    {
        for (int r = 0; r < readers.Count; r++)
        {
            IndexReader reader = (IndexReader) readers[r];
            int maxDoc = reader.MaxDoc();
            for (int docNum = 0; docNum < maxDoc; docNum++)
            {
                // skip deleted docs
                if (reader.IsDeleted(docNum))
                    continue; // NOTE(review): reconstructed — dropped by the extraction
                termVectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docNum));
            }
        }
    }
    finally
    {
        // NOTE(review): try/finally reconstructed around the visible Close()
        termVectorsWriter.Close();
    }
}
private IndexOutput freqOutput = null;          // "<segment>.frq" output, opened in MergeTerms()
private IndexOutput proxOutput = null;          // "<segment>.prx" output, opened in MergeTerms()
private TermInfosWriter termInfosWriter = null; // term dictionary writer, created in MergeTerms()
private int skipInterval;                       // copied from termInfosWriter.skipInterval in MergeTerms()
private SegmentMergeQueue queue = null;         // priority queue over the per-segment term enums
/// <summary>Opens the posting outputs and term dictionary, merges all terms,
/// and closes everything even on failure.</summary>
private void MergeTerms()
{
    try
    {
        freqOutput = directory.CreateOutput(segment + ".frq");
        proxOutput = directory.CreateOutput(segment + ".prx");
        termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
        skipInterval = termInfosWriter.skipInterval;
        queue = new SegmentMergeQueue(readers.Count);

        MergeTermInfos(); // NOTE(review): call reconstructed — dropped by the extraction
    }
    finally
    {
        // NOTE(review): the Close() calls paired with the visible null checks
        // were dropped by the extraction and are reconstructed here.
        if (freqOutput != null)
            freqOutput.Close();
        if (proxOutput != null)
            proxOutput.Close();
        if (termInfosWriter != null)
            termInfosWriter.Close();
        if (queue != null)
            queue.Close();
    }
}
/// <summary>Walks all segments' term enumerators in sorted order via the merge
/// queue, grouping identical terms and handing each group to MergeTermInfo.</summary>
private void MergeTermInfos()
{
    int base_Renamed = 0;
    for (int i = 0; i < readers.Count; i++)
    {
        IndexReader reader = (IndexReader) readers[i];
        TermEnum termEnum = reader.Terms();
        SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
        base_Renamed += reader.NumDocs();
        // NOTE(review): queue seeding reconstructed — dropped by the extraction
        if (smi.Next())
            queue.Put(smi); // initialize queue
        else
            smi.Close();
    }

    SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

    while (queue.Size() > 0)
    {
        int matchSize = 0; // pop matching terms
        match[matchSize++] = (SegmentMergeInfo) queue.Pop();
        Term term = match[0].term;
        SegmentMergeInfo top = (SegmentMergeInfo) queue.Top();

        // collect every segment positioned on the same term
        while (top != null && term.CompareTo(top.term) == 0)
        {
            match[matchSize++] = (SegmentMergeInfo) queue.Pop();
            top = (SegmentMergeInfo) queue.Top();
        }

        MergeTermInfo(match, matchSize); // add new TermInfo

        while (matchSize > 0)
        {
            SegmentMergeInfo smi = match[--matchSize];
            // NOTE(review): requeue-or-close reconstructed — only the Close()
            // branch survived the extraction.
            if (smi.Next())
                queue.Put(smi); // restore queue
            else
                smi.Close(); // done with a segment
        }
    }
}
// Reused across MergeTermInfo calls to minimize consing.
private TermInfo termInfo = new TermInfo();
/// <summary>Merge one term found in one or more segments. The array <code>smis</code>
/// contains segments that are positioned at the same term. <code>N</code>
/// is the number of cells in the array actually occupied.
/// </summary>
/// <param name="smis">array of segments</param>
/// <param name="n">number of cells in the array actually occupied</param>
private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
{
    long freqPointer = freqOutput.GetFilePointer();
    long proxPointer = proxOutput.GetFilePointer();

    int df = AppendPostings(smis, n); // append posting data

    long skipPointer = WriteSkip();

    if (df > 0) // NOTE(review): guard reconstructed — dropped by the extraction
    {
        // add an entry to the dictionary with pointers to prox and freq files
        termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
        termInfosWriter.Add(smis[0].term, termInfo);
    }
}
/// <summary>Process postings from multiple segments all positioned on the
/// same term. Writes out merged entries into freqOutput and
/// the proxOutput streams.
/// </summary>
/// <param name="smis">array of segments</param>
/// <param name="n">number of cells in the array actually occupied</param>
/// <returns> number of documents across all segments where this term was found</returns>
private int AppendPostings(SegmentMergeInfo[] smis, int n)
{
    // NOTE(review): several statements (lastDoc tracking, ResetSkip/BufferSkip,
    // the docMap null check, df++, the freq==1 branch, the return) were dropped
    // by the extraction and are reconstructed — verify against upstream Lucene.
    int lastDoc = 0;
    int df = 0; // number of docs w/ term
    ResetSkip();
    for (int i = 0; i < n; i++)
    {
        SegmentMergeInfo smi = smis[i];
        TermPositions postings = smi.GetPositions();
        int base_Renamed = smi.base_Renamed;
        int[] docMap = smi.GetDocMap();
        postings.Seek(smi.termEnum);
        while (postings.Next())
        {
            int doc = postings.Doc();
            if (docMap != null)
                doc = docMap[doc]; // map around deletions
            doc += base_Renamed; // convert to merged space

            if (doc < lastDoc)
                throw new System.SystemException("docs out of order");

            df++;

            if ((df % skipInterval) == 0)
                BufferSkip(lastDoc);

            int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
            lastDoc = doc;

            int freq = postings.Freq();
            if (freq == 1)
            {
                freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
            }
            else
            {
                freqOutput.WriteVInt(docCode); // write doc
                freqOutput.WriteVInt(freq); // write frequency in doc
            }

            int lastPosition = 0; // write position deltas
            for (int j = 0; j < freq; j++)
            {
                int position = postings.NextPosition();
                proxOutput.WriteVInt(position - lastPosition);
                lastPosition = position;
            }
        }
    }
    return df;
}
private RAMOutputStream skipBuffer = new RAMOutputStream(); // in-memory buffer for skip entries
private int lastSkipDoc;            // doc number at the last buffered skip point
private long lastSkipFreqPointer;   // .frq file position at the last skip point
private long lastSkipProxPointer;   // .prx file position at the last skip point
/// <summary>Resets the skip-entry state at the start of a term's postings.</summary>
private void ResetSkip()
{
    // NOTE(review): the skipBuffer/lastSkipDoc resets were dropped by the
    // extraction and are reconstructed — verify against upstream Lucene.
    skipBuffer.Reset();
    lastSkipDoc = 0;
    lastSkipFreqPointer = freqOutput.GetFilePointer();
    lastSkipProxPointer = proxOutput.GetFilePointer();
}
/// <summary>Buffers one skip entry for <paramref name="doc"/> as deltas against
/// the previous skip point (deltas keep the VInts small).</summary>
private void BufferSkip(int doc)
{
    long freqPointer = freqOutput.GetFilePointer();
    long proxPointer = proxOutput.GetFilePointer();

    skipBuffer.WriteVInt(doc - lastSkipDoc);
    skipBuffer.WriteVInt((int) (freqPointer - lastSkipFreqPointer));
    skipBuffer.WriteVInt((int) (proxPointer - lastSkipProxPointer));

    lastSkipDoc = doc; // NOTE(review): reconstructed — dropped by the extraction
    lastSkipFreqPointer = freqPointer;
    lastSkipProxPointer = proxPointer;
}
/// <summary>Flushes the buffered skip entries into the .frq output.</summary>
/// <returns>the .frq file position where the skip data starts</returns>
private long WriteSkip()
{
    long skipPointer = freqOutput.GetFilePointer();
    skipBuffer.WriteTo(freqOutput);
    return skipPointer; // NOTE(review): return reconstructed — the declared long return requires it
}
451 private void MergeNorms()
453 for (int i
= 0; i
< fieldInfos
.Size(); i
++)
455 FieldInfo fi
= fieldInfos
.FieldInfo(i
);
456 if (fi
.isIndexed
&& !fi
.omitNorms
)
458 IndexOutput output
= directory
.CreateOutput(segment
+ ".f" + i
);
461 for (int j
= 0; j
< readers
.Count
; j
++)
463 IndexReader reader
= (IndexReader
) readers
[j
];
464 int maxDoc
= reader
.MaxDoc();
465 byte[] input
= new byte[maxDoc
];
466 reader
.Norms(fi
.name
, input
, 0);
467 for (int k
= 0; k
< maxDoc
; k
++)
469 if (!reader
.IsDeleted(k
))
471 output
.WriteByte(input
[k
]);