/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;

namespace Lucene.Net.Index
{
    /// <summary> The SegmentMerger class combines two or more Segments, each represented by an IndexReader ({@link #add}),
    /// into a single Segment. After adding the appropriate readers, call the merge method to combine the
    /// segments.
    /// <p>
    /// If the compoundFile flag is set, then the segments will be merged into a compound file.
    /// </summary>
    /// <seealso cref="#merge">
    /// </seealso>
    /// <seealso cref="#add">
    /// </seealso>
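    /// <remarks>
    /// Illustrative usage sketch (added for clarity; not part of the original source). The
    /// Directory <c>dir</c> and the readers <c>readerA</c>/<c>readerB</c> are placeholders;
    /// the call sequence of Add, Merge and CloseReaders is taken from the API below.
    /// <code>
    /// SegmentMerger merger = new SegmentMerger(dir, "newSegment");
    /// merger.Add(readerA);
    /// merger.Add(readerB);
    /// int mergedDocCount = merger.Merge(); // writes the merged segment files
    /// merger.CloseReaders();               // close the source readers afterwards
    /// </code>
    /// </remarks>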
    sealed public class SegmentMerger
    {
        private void InitBlock()
        {
            termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
        }

        private Directory directory;
        private System.String segment;
        private int termIndexInterval;

        private System.Collections.ArrayList readers = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        private FieldInfos fieldInfos;

        // File extensions of old-style index files
        private static readonly System.String[] COMPOUND_EXTENSIONS = new System.String[] {"fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"};
        private static readonly System.String[] VECTOR_EXTENSIONS = new System.String[] {"tvx", "tvd", "tvf"};

        /// <summary>This ctor is used only by test code.
        /// </summary>
        /// <param name="dir">The Directory to merge the other segments into
        /// </param>
        /// <param name="name">The name of the new segment
        /// </param>
        public /*internal*/ SegmentMerger(Directory dir, System.String name)
        {
            InitBlock();
            directory = dir;
            segment = name;
        }

        internal SegmentMerger(IndexWriter writer, System.String name)
        {
            InitBlock();
            directory = writer.GetDirectory();
            segment = name;
            termIndexInterval = writer.GetTermIndexInterval();
        }

        /// <summary> Add an IndexReader to the collection of readers that are to be merged</summary>
        /// <param name="reader">The reader to add
        /// </param>
        public /*internal*/ void Add(IndexReader reader)
        {
            readers.Add(reader);
        }

        /// <summary> </summary>
        /// <param name="i">The index of the reader to return
        /// </param>
        /// <returns> The ith reader to be merged
        /// </returns>
        internal IndexReader SegmentReader(int i)
        {
            return (IndexReader) readers[i];
        }

        /// <summary> Merges the readers specified by the {@link #add} method into the directory passed to the constructor</summary>
        /// <returns> The number of documents that were merged
        /// </returns>
        /// <throws> IOException </throws>
        public /*internal*/ int Merge()
        {
            int value_Renamed = MergeFields();

            MergeTerms();
            MergeNorms();

            if (fieldInfos.HasVectors())
                MergeVectors();

            return value_Renamed;
        }

        /// <summary> Close all IndexReaders that have been added.
        /// Should not be called before merge().
        /// </summary>
        /// <throws> IOException </throws>
        public /*internal*/ void CloseReaders()
        {
            for (int i = 0; i < readers.Count; i++)
            {
                // close each source reader
                IndexReader reader = (IndexReader) readers[i];
                reader.Close();
            }
        }

        internal System.Collections.ArrayList CreateCompoundFile(System.String fileName)
        {
            CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName);

            System.Collections.ArrayList files = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(COMPOUND_EXTENSIONS.Length + fieldInfos.Size()));

            // Basic files
            for (int i = 0; i < COMPOUND_EXTENSIONS.Length; i++)
            {
                files.Add(segment + "." + COMPOUND_EXTENSIONS[i]);
            }

            // Field norm files: only indexed fields that keep norms have a ".f" file
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed && !fi.omitNorms)
                {
                    files.Add(segment + ".f" + i);
                }
            }

            // Vector files
            if (fieldInfos.HasVectors())
            {
                for (int i = 0; i < VECTOR_EXTENSIONS.Length; i++)
                {
                    files.Add(segment + "." + VECTOR_EXTENSIONS[i]);
                }
            }

            // Now merge all added files into the compound file
            System.Collections.IEnumerator it = files.GetEnumerator();
            while (it.MoveNext())
            {
                cfsWriter.AddFile((System.String) it.Current);
            }

            // Perform the merge
            cfsWriter.Close();

            return files;
        }

        /// <summary>Merge the stored fields of all added readers into the new segment.</summary>
        /// <returns> The number of documents in all of the readers
        /// </returns>
        /// <throws> IOException </throws>
        private int MergeFields()
        {
            fieldInfos = new FieldInfos(); // merge Field names
            int docCount = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
                fieldInfos.AddIndexed(reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
                fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false);
            }
            fieldInfos.Write(directory, segment + ".fnm");

            FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
            try
            {
                for (int i = 0; i < readers.Count; i++)
                {
                    IndexReader reader = (IndexReader) readers[i];
                    int maxDoc = reader.MaxDoc();
                    for (int j = 0; j < maxDoc; j++)
                    {
                        if (!reader.IsDeleted(j))
                        {
                            // skip deleted docs
                            fieldsWriter.AddDocument(reader.Document(j));
                            docCount++;
                        }
                    }
                }
            }
            finally
            {
                fieldsWriter.Close();
            }
            return docCount;
        }

        /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
        /// <throws> IOException </throws>
        private void MergeVectors()
        {
            TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int r = 0; r < readers.Count; r++)
                {
                    IndexReader reader = (IndexReader) readers[r];
                    int maxDoc = reader.MaxDoc();
                    for (int docNum = 0; docNum < maxDoc; docNum++)
                    {
                        // skip deleted docs
                        if (reader.IsDeleted(docNum))
                            continue;
                        termVectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docNum));
                    }
                }
            }
            finally
            {
                termVectorsWriter.Close();
            }
        }

        private IndexOutput freqOutput = null;
        private IndexOutput proxOutput = null;
        private TermInfosWriter termInfosWriter = null;
        private int skipInterval;
        private SegmentMergeQueue queue = null;

        private void MergeTerms()
        {
            try
            {
                freqOutput = directory.CreateOutput(segment + ".frq");
                proxOutput = directory.CreateOutput(segment + ".prx");
                termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                skipInterval = termInfosWriter.skipInterval;
                queue = new SegmentMergeQueue(readers.Count);

                MergeTermInfos();
            }
            finally
            {
                if (freqOutput != null)
                    freqOutput.Close();
                if (proxOutput != null)
                    proxOutput.Close();
                if (termInfosWriter != null)
                    termInfosWriter.Close();
                if (queue != null)
                    queue.Close();
            }
        }

        private void MergeTermInfos()
        {
            int base_Renamed = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                TermEnum termEnum = reader.Terms();
                SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
                base_Renamed += reader.NumDocs();
                if (smi.Next())
                    queue.Put(smi); // initialize queue
                else
                    smi.Close();
            }

            SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

            while (queue.Size() > 0)
            {
                int matchSize = 0; // pop matching terms
                match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                Term term = match[0].term;
                SegmentMergeInfo top = (SegmentMergeInfo) queue.Top();

                while (top != null && term.CompareTo(top.term) == 0)
                {
                    match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                    top = (SegmentMergeInfo) queue.Top();
                }

                MergeTermInfo(match, matchSize); // add new TermInfo

                while (matchSize > 0)
                {
                    SegmentMergeInfo smi = match[--matchSize];
                    if (smi.Next())
                        queue.Put(smi); // restore queue
                    else
                        smi.Close(); // done with a segment
                }
            }
        }

        private TermInfo termInfo = new TermInfo(); // minimize consing

        /// <summary>Merge one term found in one or more segments. The array <code>smis</code>
        /// contains segments that are positioned at the same term. <code>n</code>
        /// is the number of cells in the array actually occupied.
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();

            int df = AppendPostings(smis, n); // append posting data

            long skipPointer = WriteSkip();

            if (df > 0)
            {
                // add an entry to the dictionary with pointers to prox and freq files
                termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
                termInfosWriter.Add(smis[0].term, termInfo);
            }
        }

        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into the freqOutput and
        /// proxOutput streams.
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        /// <returns> number of documents across all segments where this term was found
        /// </returns>
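        /// <remarks>
        /// Worked example (added for clarity; not in the original source): the .frq stream stores
        /// doc deltas shifted left by one bit, with the low bit flagging freq == 1. A posting whose
        /// doc delta is 5 with freq 1 is written as the single VInt (5 &lt;&lt; 1) | 1 = 11; the same
        /// delta with freq 3 is written as the VInt 10 followed by the VInt 3.
        /// </remarks>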
        private int AppendPostings(SegmentMergeInfo[] smis, int n)
        {
            int lastDoc = 0;
            int df = 0; // number of docs w/ term
            ResetSkip();
            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi = smis[i];
                TermPositions postings = smi.postings;
                int base_Renamed = smi.base_Renamed;
                int[] docMap = smi.docMap;
                postings.Seek(smi.termEnum);
                while (postings.Next())
                {
                    int doc = postings.Doc();
                    if (docMap != null)
                        doc = docMap[doc]; // map around deletions
                    doc += base_Renamed; // convert to merged space

                    if (doc < lastDoc)
                        throw new System.SystemException("docs out of order");

                    df++;

                    if ((df % skipInterval) == 0)
                    {
                        BufferSkip(lastDoc);
                    }

                    int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
                    lastDoc = doc;

                    int freq = postings.Freq();
                    if (freq == 1)
                    {
                        freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
                    }
                    else
                    {
                        freqOutput.WriteVInt(docCode); // write doc
                        freqOutput.WriteVInt(freq); // write frequency in doc
                    }

                    int lastPosition = 0; // write position deltas
                    for (int j = 0; j < freq; j++)
                    {
                        int position = postings.NextPosition();
                        proxOutput.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }
            }
            return df;
        }

        private RAMOutputStream skipBuffer = new RAMOutputStream();
        private int lastSkipDoc;
        private long lastSkipFreqPointer;
        private long lastSkipProxPointer;

        private void ResetSkip()
        {
            skipBuffer.Reset();
            lastSkipDoc = 0;
            lastSkipFreqPointer = freqOutput.GetFilePointer();
            lastSkipProxPointer = proxOutput.GetFilePointer();
        }

        private void BufferSkip(int doc)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();

            // each skip entry: doc delta, freq-pointer delta, prox-pointer delta
            skipBuffer.WriteVInt(doc - lastSkipDoc);
            skipBuffer.WriteVInt((int) (freqPointer - lastSkipFreqPointer));
            skipBuffer.WriteVInt((int) (proxPointer - lastSkipProxPointer));

            lastSkipDoc = doc;
            lastSkipFreqPointer = freqPointer;
            lastSkipProxPointer = proxPointer;
        }
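
        // Illustrative note (added; not in the original source): BufferSkip records each skip
        // point as three VInt deltas. For example, if the previous skip point was doc 17 with
        // freq pointer 1000 and prox pointer 2000, and the current one is doc 42 with pointers
        // 1100 and 2300, the entry written to skipBuffer is the VInts 25, 100 and 300.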

        private long WriteSkip()
        {
            long skipPointer = freqOutput.GetFilePointer();
            skipBuffer.WriteTo(freqOutput);
            return skipPointer;
        }

        private void MergeNorms()
        {
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed && !fi.omitNorms)
                {
                    IndexOutput output = directory.CreateOutput(segment + ".f" + i);
                    try
                    {
                        for (int j = 0; j < readers.Count; j++)
                        {
                            IndexReader reader = (IndexReader) readers[j];
                            int maxDoc = reader.MaxDoc();
                            byte[] input = new byte[maxDoc];
                            reader.Norms(fi.name, input, 0);
                            for (int k = 0; k < maxDoc; k++)
                            {
                                if (!reader.IsDeleted(k))
                                {
                                    output.WriteByte(input[k]);
                                }
                            }
                        }
                    }
                    finally
                    {
                        output.Close();
                    }
                }
            }
        }
    }
}