/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using Directory = Lucene.Net.Store.Directory;
using OutputStream = Lucene.Net.Store.OutputStream;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;

namespace Lucene.Net.Index
{
    /// <summary> The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}),
    /// into a single Segment.  After adding the appropriate readers, call the merge method to combine the
    /// segments.
    /// <P>
    /// If the compoundFile flag is set, then the segments will be merged into a compound file.
    /// </summary>
    /// <seealso cref="#merge">
    /// </seealso>
    /// <seealso cref="#add">
    /// </seealso>
    sealed public class SegmentMerger
    {
        private bool useCompoundFile;
        private Directory directory;
        private System.String segment;

        private System.Collections.ArrayList readers = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        private FieldInfos fieldInfos;

        // File extensions of old-style index files
        private static readonly System.String[] COMPOUND_EXTENSIONS = new System.String[]{"fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"};
        private static readonly System.String[] VECTOR_EXTENSIONS = new System.String[]{"tvx", "tvd", "tvf"};
        /// <summary> </summary>
        /// <param name="dir">The Directory to merge the other segments into
        /// </param>
        /// <param name="name">The name of the new segment
        /// </param>
        /// <param name="compoundFile">true if the new segment should use a compoundFile
        /// </param>
        public /*internal*/ SegmentMerger(Directory dir, System.String name, bool compoundFile)
        {
            directory = dir;
            segment = name;
            useCompoundFile = compoundFile;
        }
        /// <summary> Add an IndexReader to the collection of readers that are to be merged</summary>
        /// <param name="reader">the reader to add
        /// </param>
        public /*internal*/ void Add(IndexReader reader)
        {
            readers.Add(reader);
        }
        /// <summary> </summary>
        /// <param name="i">The index of the reader to return
        /// </param>
        /// <returns> The ith reader to be merged
        /// </returns>
        internal IndexReader SegmentReader(int i)
        {
            return (IndexReader) readers[i];
        }
        /// <summary> Merges the readers specified by the {@link #add} method into the directory passed to the constructor</summary>
        /// <returns> The number of documents that were merged
        /// </returns>
        /// <throws> IOException </throws>
        public /*internal*/ int Merge()
        {
            int value_Renamed;

            value_Renamed = MergeFields();
            MergeTerms();
            MergeNorms();

            if (fieldInfos.HasVectors())
                MergeVectors();

            if (useCompoundFile)
                CreateCompoundFile();

            return value_Renamed;
        }
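        // Typical calling sequence for this class (an illustrative sketch; the actual
        // driver is IndexWriter and may differ in detail):
        //
        //   SegmentMerger merger = new SegmentMerger(directory, mergedName, useCompoundFile);
        //   merger.Add(readerA);                 // one reader per source segment
        //   merger.Add(readerB);
        //   int mergedDocCount = merger.Merge(); // writes the new segment's files
        //   merger.CloseReaders();               // release the source readers afterwards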
        /// <summary> Close all IndexReaders that have been added.
        /// Should not be called before merge().
        /// </summary>
        /// <throws> IOException </throws>
        public /*internal*/ void CloseReaders()
        {
            for (int i = 0; i < readers.Count; i++)
            {
                // close readers
                IndexReader reader = (IndexReader) readers[i];
                reader.Close();
            }
        }
        private void CreateCompoundFile()
        {
            CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, segment + ".cfs");

            System.Collections.ArrayList files = new System.Collections.ArrayList(COMPOUND_EXTENSIONS.Length + fieldInfos.Size());

            // Basic files
            for (int i = 0; i < COMPOUND_EXTENSIONS.Length; i++)
            {
                files.Add(segment + "." + COMPOUND_EXTENSIONS[i]);
            }

            // Field norm files
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed)
                {
                    files.Add(segment + ".f" + i);
                }
            }

            // Vector files
            if (fieldInfos.HasVectors())
            {
                for (int i = 0; i < VECTOR_EXTENSIONS.Length; i++)
                {
                    files.Add(segment + "." + VECTOR_EXTENSIONS[i]);
                }
            }

            // Now merge all added files
            System.Collections.IEnumerator it = files.GetEnumerator();
            while (it.MoveNext())
            {
                cfsWriter.AddFile((System.String) it.Current);
            }

            // Perform the merge
            cfsWriter.Close();

            // Now delete the source files
            it = files.GetEnumerator();
            while (it.MoveNext())
            {
                directory.DeleteFile((System.String) it.Current);
            }
        }
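        // Example of what CreateCompoundFile packs together (illustrative names only):
        // for a segment named "_a" with two indexed fields, the entries are
        // _a.fnm, _a.frq, _a.prx, _a.fdx, _a.fdt, _a.tii, _a.tis, _a.f0, _a.f1,
        // plus _a.tvx, _a.tvd, _a.tvf when term vectors are present; the loose files
        // are deleted only after _a.cfs has been written and closed.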
        /// <summary> </summary>
        /// <returns> The number of documents in all of the readers
        /// </returns>
        /// <throws> IOException </throws>
        private int MergeFields()
        {
            fieldInfos = new FieldInfos(); // merge Field names
            int docCount = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                fieldInfos.AddIndexed(reader.GetIndexedFieldNames(true), true);
                fieldInfos.AddIndexed(reader.GetIndexedFieldNames(false), false);
                fieldInfos.Add(reader.GetFieldNames(false), false);
            }
            fieldInfos.Write(directory, segment + ".fnm");

            FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos); // merge Field values
            try
            {
                for (int i = 0; i < readers.Count; i++)
                {
                    IndexReader reader = (IndexReader) readers[i];
                    int maxDoc = reader.MaxDoc();
                    for (int j = 0; j < maxDoc; j++)
                    {
                        if (!reader.IsDeleted(j))
                        {
                            // skip deleted docs
                            fieldsWriter.AddDocument(reader.Document(j));
                            docCount++;
                        }
                    }
                }
            }
            finally
            {
                fieldsWriter.Close();
            }
            return docCount;
        }
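        // Document numbering after MergeFields: deleted documents are skipped while
        // copying, so surviving documents are renumbered consecutively in the new
        // segment.  For example, if reader A has maxDoc=5 with doc 2 deleted and
        // reader B has maxDoc=3 with no deletions, the merged segment holds 7
        // documents numbered 0..6.  The same squeeze is applied to postings via each
        // reader's docMap in AppendPostings below.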
        /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
        /// <throws> IOException </throws>
        private void MergeVectors()
        {
            TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int r = 0; r < readers.Count; r++)
                {
                    IndexReader reader = (IndexReader) readers[r];
                    int maxDoc = reader.MaxDoc();
                    for (int docNum = 0; docNum < maxDoc; docNum++)
                    {
                        // skip deleted docs
                        if (reader.IsDeleted(docNum))
                        {
                            continue;
                        }
                        termVectorsWriter.OpenDocument();

                        // get all term vectors
                        TermFreqVector[] sourceTermVector = reader.GetTermFreqVectors(docNum);

                        if (sourceTermVector != null)
                        {
                            for (int f = 0; f < sourceTermVector.Length; f++)
                            {
                                // translate Field numbers
                                TermFreqVector termVector = sourceTermVector[f];
                                termVectorsWriter.OpenField(termVector.GetField());
                                System.String[] terms = termVector.GetTerms();
                                int[] freqs = termVector.GetTermFrequencies();

                                for (int t = 0; t < terms.Length; t++)
                                {
                                    termVectorsWriter.AddTerm(terms[t], freqs[t]);
                                }
                            }
                            termVectorsWriter.CloseDocument();
                        }
                    }
                }
            }
            finally
            {
                termVectorsWriter.Close();
            }
        }
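        // MergeVectors rewrites each surviving document's term vectors into the merged
        // segment's .tvx/.tvd/.tvf files in merged document order; because fields are
        // re-opened by name through OpenField, source field numbers are translated to
        // the merged FieldInfos rather than copied verbatim.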
        private OutputStream freqOutput = null;
        private OutputStream proxOutput = null;
        private TermInfosWriter termInfosWriter = null;
        private int skipInterval;
        private SegmentMergeQueue queue = null;
        private void MergeTerms()
        {
            try
            {
                freqOutput = directory.CreateFile(segment + ".frq");
                proxOutput = directory.CreateFile(segment + ".prx");
                termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos);
                skipInterval = termInfosWriter.skipInterval;
                queue = new SegmentMergeQueue(readers.Count);

                MergeTermInfos();
            }
            finally
            {
                if (freqOutput != null)
                    freqOutput.Close();
                if (proxOutput != null)
                    proxOutput.Close();
                if (termInfosWriter != null)
                    termInfosWriter.Close();
                if (queue != null)
                    queue.Close();
            }
        }
        private void MergeTermInfos()
        {
            int base_Renamed = 0;
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader) readers[i];
                TermEnum termEnum = reader.Terms();
                SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
                base_Renamed += reader.NumDocs();
                if (smi.Next())
                    queue.Put(smi); // initialize queue
                else
                    smi.Close();
            }

            SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

            while (queue.Size() > 0)
            {
                int matchSize = 0; // pop matching terms
                match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                Term term = match[0].term;
                SegmentMergeInfo top = (SegmentMergeInfo) queue.Top();

                while (top != null && term.CompareTo(top.term) == 0)
                {
                    match[matchSize++] = (SegmentMergeInfo) queue.Pop();
                    top = (SegmentMergeInfo) queue.Top();
                }

                MergeTermInfo(match, matchSize); // add new TermInfo

                while (matchSize > 0)
                {
                    SegmentMergeInfo smi = match[--matchSize];
                    if (smi.Next())
                        queue.Put(smi); // restore queue
                    else
                        smi.Close(); // done with a segment
                }
            }
        }
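        // Illustrative walk-through of MergeTermInfos: with two source segments whose
        // TermEnums are both positioned on the term "apple", both SegmentMergeInfos
        // compare equal at the top of the queue and are popped into match[], so
        // MergeTermInfo sees matchSize == 2 and appends the postings of both segments
        // (in segment order) under a single dictionary entry.  A term present in only
        // one segment is popped alone with matchSize == 1.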
        private TermInfo termInfo = new TermInfo(); // minimize consing
        /// <summary>Merge one term found in one or more segments. The array <code>smis</code>
        /// contains segments that are positioned at the same term. <code>n</code>
        /// is the number of cells in the array actually occupied.
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();

            int df = AppendPostings(smis, n); // append posting data

            long skipPointer = WriteSkip();

            if (df > 0)
            {
                // add an entry to the dictionary with pointers to prox and freq files
                termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
                termInfosWriter.Add(smis[0].term, termInfo);
            }
        }
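        // The fourth argument to termInfo.Set is the skip data's offset relative to
        // freqPointer: WriteSkip appends the buffered skip entries to the .frq stream
        // directly after this term's postings, so a reader can locate them at
        // freqPointer + skipOffset instead of in a separate file.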
        /// <summary>Process postings from multiple segments all positioned on the
        /// same term. Writes out merged entries into the freqOutput and
        /// proxOutput streams.
        /// </summary>
        /// <param name="smis">array of segments
        /// </param>
        /// <param name="n">number of cells in the array actually occupied
        /// </param>
        /// <returns> number of documents across all segments where this term was found
        /// </returns>
        private int AppendPostings(SegmentMergeInfo[] smis, int n)
        {
            int lastDoc = 0;
            int df = 0; // number of docs w/ term
            ResetSkip();
            for (int i = 0; i < n; i++)
            {
                SegmentMergeInfo smi = smis[i];
                TermPositions postings = smi.postings;
                int base_Renamed = smi.base_Renamed;
                int[] docMap = smi.docMap;
                postings.Seek(smi.termEnum);
                while (postings.Next())
                {
                    int doc = postings.Doc();
                    if (docMap != null)
                        doc = docMap[doc]; // map around deletions
                    doc += base_Renamed; // convert to merged space

                    if (doc < lastDoc)
                        throw new System.SystemException("docs out of order");

                    df++;

                    if ((df % skipInterval) == 0)
                    {
                        BufferSkip(lastDoc);
                    }

                    int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
                    lastDoc = doc;

                    int freq = postings.Freq();
                    if (freq == 1)
                    {
                        freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
                    }
                    else
                    {
                        freqOutput.WriteVInt(docCode); // write doc
                        freqOutput.WriteVInt(freq); // write frequency in doc
                    }

                    int lastPosition = 0; // write position deltas
                    for (int j = 0; j < freq; j++)
                    {
                        int position = postings.NextPosition();
                        proxOutput.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                }
            }
            return df;
        }
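        // Worked example of the doc/freq encoding in AppendPostings (illustrative
        // numbers): for a term appearing in merged docs 5 (freq 1) and 8 (freq 3),
        // starting from lastDoc = 0:
        //   doc 5: docCode = (5 - 0) << 1 = 10, freq == 1 -> WriteVInt(10 | 1) = 11
        //   doc 8: docCode = (8 - 5) << 1 = 6,  freq == 3 -> WriteVInt(6), WriteVInt(3)
        // Positions go to the .prx stream as deltas, e.g. positions 2, 7 -> 2, 5.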
        private RAMOutputStream skipBuffer = new RAMOutputStream();
        private int lastSkipDoc;
        private long lastSkipFreqPointer;
        private long lastSkipProxPointer;
        private void ResetSkip()
        {
            skipBuffer.Reset();
            lastSkipDoc = 0;
            lastSkipFreqPointer = freqOutput.GetFilePointer();
            lastSkipProxPointer = proxOutput.GetFilePointer();
        }
        private void BufferSkip(int doc)
        {
            long freqPointer = freqOutput.GetFilePointer();
            long proxPointer = proxOutput.GetFilePointer();

            skipBuffer.WriteVInt(doc - lastSkipDoc);
            skipBuffer.WriteVInt((int) (freqPointer - lastSkipFreqPointer));
            skipBuffer.WriteVInt((int) (proxPointer - lastSkipProxPointer));

            lastSkipDoc = doc;
            lastSkipFreqPointer = freqPointer;
            lastSkipProxPointer = proxPointer;
        }
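        // BufferSkip is called once every skipInterval documents of a posting list
        // (AppendPostings invokes it when df is a multiple of skipInterval).  Each skip
        // entry is a triple of deltas -- doc number, .frq offset and .prx offset -- so,
        // with skipInterval = 16 for instance, a reader can jump ahead 16 documents at
        // a time instead of decoding every posting.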
        private long WriteSkip()
        {
            long skipPointer = freqOutput.GetFilePointer();
            skipBuffer.WriteTo(freqOutput);
            return skipPointer;
        }
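        // MergeNorms below writes one norms file per indexed field, named
        // segment + ".f" + fieldNumber, containing one byte per surviving document in
        // merged document order; a reader that has no norms for the field contributes
        // a 0 byte for each of its documents.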
        private void MergeNorms()
        {
            for (int i = 0; i < fieldInfos.Size(); i++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(i);
                if (fi.isIndexed)
                {
                    OutputStream output = directory.CreateFile(segment + ".f" + i);
                    try
                    {
                        for (int j = 0; j < readers.Count; j++)
                        {
                            IndexReader reader = (IndexReader) readers[j];
                            byte[] input = reader.Norms(fi.name);
                            int maxDoc = reader.MaxDoc();
                            for (int k = 0; k < maxDoc; k++)
                            {
                                byte norm = input != null ? input[k] : (byte) 0;
                                if (!reader.IsDeleted(k))
                                {
                                    output.WriteByte(norm);
                                }
                            }
                        }
                    }
                    finally
                    {
                        output.Close();
                    }
                }
            }
        }
    }
}