Initial revision
[beagle.git] / Lucene.Net / Index / SegmentMerger.cs
blob47a8f5d76c015cbc579d73d0d29a01b0ad7b517c
1 using System;
2 using System.Collections;
4 using Lucene.Net.Store;
5 using Lucene.Net.Util;
7 namespace Lucene.Net.Index
9 /* ====================================================================
10 * The Apache Software License, Version 1.1
12 * Copyright (c) 2001 The Apache Software Foundation. All rights
13 * reserved.
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in
24 * the documentation and/or other materials provided with the
25 * distribution.
27 * 3. The end-user documentation included with the redistribution,
28 * if any, must include the following acknowledgment:
29 * "This product includes software developed by the
30 * Apache Software Foundation (http://www.apache.org/)."
31 * Alternately, this acknowledgment may appear in the software itself,
32 * if and wherever such third-party acknowledgments normally appear.
34 * 4. The names "Apache" and "Apache Software Foundation" and
35 * "Apache Lucene" must not be used to endorse or promote products
36 * derived from this software without prior written permission. For
37 * written permission, please contact apache@apache.org.
39 * 5. Products derived from this software may not be called "Apache",
40 * "Apache Lucene", nor may "Apache" appear in their name, without
41 * prior written permission of the Apache Software Foundation.
43 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
44 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
45 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
47 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
48 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
49 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
50 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
51 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
52 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
53 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * SUCH DAMAGE.
55 * ====================================================================
57 * This software consists of voluntary contributions made by many
58 * individuals on behalf of the Apache Software Foundation. For more
59 * information on the Apache Software Foundation, please see
60 * <http://www.apache.org/>.
/// <summary>
/// Merges the contents of several segments (exposed as IndexReaders) into a
/// single new segment written under <c>segment</c> in <c>directory</c>.
/// </summary>
public sealed class SegmentMerger
{
	// When true, Merge() packs the new segment's files into a single
	// "<segment>.cfs" compound file and deletes the per-extension originals.
	private bool useCompoundFile;
	private Directory directory;   // destination directory for the merged segment
	private String segment;        // name of the segment being written

	private ArrayList readers = new ArrayList(); // IndexReaders queued via Add()
	private FieldInfos fieldInfos;               // merged field table, built by MergeFields()

	// File extensions of old-style index files
	private static string[] COMPOUND_EXTENSIONS = new string[]
		{"fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"};
76 public SegmentMerger(Directory dir, String name, bool compoundFile)
78 directory = dir;
79 segment = name;
80 useCompoundFile = compoundFile;
83 public void Add(IndexReader reader)
85 readers.Add(reader);
88 public IndexReader SegmentReader(int i)
90 return (IndexReader)readers[i];
93 public int Merge()
95 int _value;
97 try
99 MergeFields();
100 MergeTerms();
101 _value = MergeNorms();
103 finally
105 for (int i = 0; i < readers.Count; i++)
106 { // close readers
107 IndexReader reader = (IndexReader)readers[i];
108 reader.Close();
112 if (useCompoundFile)
113 CreateCompoundFile();
115 return _value;
118 private void CreateCompoundFile()
120 CompoundFileWriter cfsWriter =
121 new CompoundFileWriter(directory, segment + ".cfs");
123 ArrayList files =
124 new ArrayList(COMPOUND_EXTENSIONS.Length + fieldInfos.Size());
126 // Basic files
127 for (int i=0; i<COMPOUND_EXTENSIONS.Length; i++)
129 files.Add(segment + "." + COMPOUND_EXTENSIONS[i]);
132 // Field norm files
133 for (int i = 0; i < fieldInfos.Size(); i++)
135 FieldInfo fi = fieldInfos.FieldInfo(i);
136 if (fi.isIndexed)
138 files.Add(segment + ".f" + i);
142 // Now merge all added files
143 foreach(string file in files)
145 cfsWriter.AddFile(file);
148 // Perform the merge
149 cfsWriter.Close();
151 // Now delete the source files
152 foreach(string file in files)
154 directory.DeleteFile(file);
158 private void MergeFields()
160 fieldInfos = new FieldInfos(); // merge field names
161 for (int i = 0; i < readers.Count; i++)
163 IndexReader reader = (IndexReader)readers[i];
165 fieldInfos.Add(reader.GetFieldNames(true), true);
166 fieldInfos.Add(reader.GetFieldNames(false), false);
168 fieldInfos.Write(directory, segment + ".fnm");
170 FieldsWriter fieldsWriter = // merge field values
171 new FieldsWriter(directory, segment, fieldInfos);
172 try
174 for (int i = 0; i < readers.Count; i++)
176 IndexReader reader = (IndexReader)readers[i];
178 int maxDoc = reader.MaxDoc();
179 for (int j = 0; j < maxDoc; j++)
180 if (!reader.IsDeleted(j)) // skip deleted docs
181 fieldsWriter.AddDocument(reader.Document(j));
184 finally
186 fieldsWriter.Close();
	// State shared between MergeTerms() and the postings helpers below;
	// all are null outside of a MergeTerms() call.
	private OutputStream freqOutput = null;        // "<segment>.frq" output
	private OutputStream proxOutput = null;        // "<segment>.prx" output
	private TermInfosWriter termInfosWriter = null; // merged term dictionary writer
	private SegmentMergeQueue queue = null;        // per-segment term enums, ordered by term
	/// <summary>
	/// Merges the term dictionaries and postings of all readers, writing
	/// the frequency (".frq"), proximity (".prx") and term dictionary
	/// files.  All outputs are closed in the finally block even if the
	/// merge fails.
	/// </summary>
	private void MergeTerms()
	{
		try
		{
			freqOutput = directory.CreateFile(segment + ".frq");
			proxOutput = directory.CreateFile(segment + ".prx");
			termInfosWriter =
				new TermInfosWriter(directory, segment, fieldInfos);

			MergeTermInfos();
		}
		finally
		{
			// NOTE(review): if an earlier Close() throws, the remaining
			// ones are skipped — kept as-is to preserve original behavior.
			if (freqOutput != null) freqOutput.Close();
			if (proxOutput != null) proxOutput.Close();
			if (termInfosWriter != null) termInfosWriter.Close();
			if (queue != null) queue.Close();
		}
	}
	/// <summary>
	/// K-way merge of all segments' term enumerations.  Each segment's enum
	/// sits in a priority queue ordered by its current term; on every pass
	/// all enums positioned on the same smallest term are popped together,
	/// their postings are merged into one dictionary entry, then each popped
	/// enum is advanced and re-queued (or closed when exhausted).
	/// </summary>
	private void MergeTermInfos()
	{
		queue = new SegmentMergeQueue(readers.Count);
		// Offset of the current segment's first doc in merged doc-number space.
		int _base = 0;
		for (int i = 0; i < readers.Count; i++)
		{
			IndexReader reader = (IndexReader)readers[i];
			TermEnum termEnum = reader.Terms();
			SegmentMergeInfo smi = new SegmentMergeInfo(_base, termEnum, reader);
			// NumDocs() (not MaxDoc): deleted docs are squeezed out of the
			// merged numbering (see the docMap remapping in AppendPostings).
			_base += reader.NumDocs();
			if (smi.Next())
				queue.Put(smi); // initialize queue
			else
				smi.Close();
		}

		SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

		while (queue.Size() > 0)
		{
			int matchSize = 0; // pop matching terms
			match[matchSize++] = (SegmentMergeInfo)queue.Pop();
			Term term = match[0].term;
			SegmentMergeInfo top = (SegmentMergeInfo)queue.Top();

			// Collect every other segment currently positioned on the same term.
			while (top != null && term.CompareTo(top.term) == 0)
			{
				match[matchSize++] = (SegmentMergeInfo)queue.Pop();
				top = (SegmentMergeInfo)queue.Top();
			}

			MergeTermInfo(match, matchSize); // add new TermInfo

			while (matchSize > 0)
			{
				SegmentMergeInfo smi = match[--matchSize];
				if (smi.Next())
					queue.Put(smi); // restore queue
				else
					smi.Close(); // done with a segment
			}
		}
	}
	// Reused dictionary-entry buffer (minimize consing).
	private readonly TermInfo termInfo = new TermInfo();

	/// <summary>
	/// Writes the merged postings for a single term (the first
	/// <paramref name="n"/> entries of <paramref name="smis"/>, all
	/// positioned on that term) and, when at least one document survived,
	/// adds a dictionary entry pointing at where the term's postings begin
	/// in the .frq/.prx files.  The file pointers must be captured BEFORE
	/// AppendPostings advances the outputs.
	/// </summary>
	private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
	{
		long freqPointer = freqOutput.GetFilePointer();
		long proxPointer = proxOutput.GetFilePointer();

		int df = AppendPostings(smis, n); // append posting data

		if (df > 0)
		{
			// add an entry to the dictionary with pointers to prox and freq files
			termInfo.Set(df, freqPointer, proxPointer);
			termInfosWriter.Add(smis[0].term, termInfo);
		}
	}
	/// <summary>
	/// Appends one term's postings from <paramref name="n"/> segments to the
	/// .frq/.prx outputs.  Doc numbers are remapped around deletions
	/// (docMap) and shifted into merged space (_base); since segments are
	/// processed in ascending _base order, merged doc numbers are strictly
	/// increasing, which the delta encoding below relies on.
	/// </summary>
	/// <returns>the term's merged document frequency</returns>
	private int AppendPostings(SegmentMergeInfo[] smis, int n)
	{
		int lastDoc = 0;
		int df = 0; // number of docs w/ term
		for (int i = 0; i < n; i++)
		{
			SegmentMergeInfo smi = smis[i];
			TermPositions postings = smi.postings;
			int _base = smi._base;
			int[] docMap = smi.docMap;
			postings.Seek(smi.termEnum);
			while (postings.Next())
			{
				int doc = postings.Doc();
				if (docMap != null)
					doc = docMap[doc]; // map around deletions
				doc += _base; // convert to merged space

				if (doc < lastDoc)
					throw new InvalidOperationException("docs out of order");

				// Doc delta is shifted left one bit; the low bit flags freq==1,
				// saving a VInt for the common single-occurrence case.
				int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
				lastDoc = doc;

				int freq = postings.Freq();
				if (freq == 1)
				{
					freqOutput.WriteVInt(docCode | 1); // write doc & freq=1
				}
				else
				{
					freqOutput.WriteVInt(docCode); // write doc
					freqOutput.WriteVInt(freq); // write frequency in doc
				}

				int lastPosition = 0; // write position deltas
				for (int j = 0; j < freq; j++)
				{
					int position = postings.NextPosition();
					proxOutput.WriteVInt(position - lastPosition);
					lastPosition = position;
				}

				df++;
			}
		}
		return df;
	}
325 private int MergeNorms()
327 int docCount = 0;
328 for (int i = 0; i < fieldInfos.Size(); i++)
330 FieldInfo fi = fieldInfos.FieldInfo(i);
331 if (fi.isIndexed)
333 OutputStream output = directory.CreateFile(segment + ".f" + i);
334 try
336 for (int j = 0; j < readers.Count; j++)
338 IndexReader reader = (IndexReader)readers[j];
339 byte[] input = reader.Norms(fi.name);
341 int maxDoc = reader.MaxDoc();
342 for (int k = 0; k < maxDoc; k++)
344 byte norm = input != null ? input[k] : (byte)0;
345 if (!reader.IsDeleted(k))
347 output.WriteByte(norm);
348 docCount++;
353 finally
355 output.Close();
360 return docCount;