Initial revision
[beagle.git] / Lucene.Net / Index / DocumentWriter.cs
blob3c987ec8101fd54d28645686c1c35b285c136105
1 using System;
2 using System.IO;
3 using System.Collections;
5 using Lucene.Net.Documents;
6 using Lucene.Net.Analysis;
7 using Lucene.Net.Store;
8 using Lucene.Net.Search;
10 namespace Lucene.Net.Index
12 /* ====================================================================
13 * The Apache Software License, Version 1.1
15 * Copyright (c) 2001 The Apache Software Foundation. All rights
16 * reserved.
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
25 * 2. Redistributions in binary form must reproduce the above copyright
26 * notice, this list of conditions and the following disclaimer in
27 * the documentation and/or other materials provided with the
28 * distribution.
30 * 3. The end-user documentation included with the redistribution,
31 * if any, must include the following acknowledgment:
32 * "This product includes software developed by the
33 * Apache Software Foundation (http://www.apache.org/)."
34 * Alternately, this acknowledgment may appear in the software itself,
35 * if and wherever such third-party acknowledgments normally appear.
37 * 4. The names "Apache" and "Apache Software Foundation" and
38 * "Apache Lucene" must not be used to endorse or promote products
39 * derived from this software without prior written permission. For
40 * written permission, please contact apache@apache.org.
42 * 5. Products derived from this software may not be called "Apache",
43 * "Apache Lucene", nor may "Apache" appear in their name, without
44 * prior written permission of the Apache Software Foundation.
46 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
47 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
48 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
49 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
50 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
51 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
52 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
53 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
54 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
55 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
56 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 * ====================================================================
60 * This software consists of voluntary contributions made by many
61 * individuals on behalf of the Apache Software Foundation. For more
62 * information on the Apache Software Foundation, please see
63 * <http://www.apache.org/>.
/// <summary>
/// Writes a single <see cref="Document"/> into a named segment of a directory:
/// the field names (<c>.fnm</c>), the stored field values (via FieldsWriter),
/// the term postings (<c>.frq</c>, <c>.prx</c> and the term dictionary written by
/// TermInfosWriter) and one norm byte per indexed field (<c>.f&lt;n&gt;</c>).
/// </summary>
/// <remarks>
/// NOTE(review): per-document working state (posting table, field lengths, field
/// boosts, term buffer) lives in instance fields that every AddDocument call
/// resets, so one instance presumably must not be used by several threads at once.
/// </remarks>
public sealed class DocumentWriter
{
    private readonly Analyzer analyzer;
    private readonly Lucene.Net.Store.Directory directory;
    private readonly Similarity similarity;
    private readonly int maxFieldLength;

    // Field-name table of the document currently being written.
    private FieldInfos fieldInfos;

    // Keys are Terms, values are Postings.
    // Used to buffer a document before it is written to the index.
    private readonly Hashtable postingTable = new Hashtable();

    // Both arrays are indexed by field number and re-created for every document.
    private int[] fieldLengths;
    private float[] fieldBoosts;

    // Reused as the lookup key into postingTable to avoid allocating a Term per token.
    private readonly Term termBuffer = new Term("", "");

    /// <summary>
    /// Creates a writer bound to a directory, analyzer and similarity.
    /// </summary>
    /// <param name="directory">Directory the segment files are created in.</param>
    /// <param name="analyzer">Analyzer used to tokenize tokenized fields.</param>
    /// <param name="similarity">Supplies the length norm for each indexed field.</param>
    /// <param name="maxFieldLength">
    /// Tokenizing of a field stops once the running position exceeds this value.
    /// </param>
    public DocumentWriter(Lucene.Net.Store.Directory directory, Analyzer analyzer,
                          Similarity similarity, int maxFieldLength)
    {
        this.directory = directory;
        this.analyzer = analyzer;
        this.similarity = similarity;
        this.maxFieldLength = maxFieldLength;
    }

    /// <summary>
    /// Writes <paramref name="doc"/> as the only document of segment
    /// <paramref name="segment"/>.
    /// </summary>
    /// <param name="segment">Segment name; used as the prefix of every file created.</param>
    /// <param name="doc">The document to write.</param>
    public void AddDocument(String segment, Document doc)
    {
        // write field names
        fieldInfos = new FieldInfos();
        fieldInfos.Add(doc);
        fieldInfos.Write(directory, segment + ".fnm");

        // write field values
        FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
        try
        {
            fieldsWriter.AddDocument(doc);
        }
        finally
        {
            fieldsWriter.Close();
        }

        // invert doc into postingTable
        postingTable.Clear();
        fieldLengths = new int[fieldInfos.Size()];

        // every field starts from the document-level boost
        fieldBoosts = new float[fieldInfos.Size()];
        float boost = doc.GetBoost();
        for (int i = 0; i < fieldBoosts.Length; i++)
        {
            fieldBoosts[i] = boost;
        }

        InvertDocument(doc);

        // sort postingTable into an array
        Posting[] postings = SortPostingTable();

        // write postings
        WritePostings(postings, segment);

        // write norms of indexed fields
        WriteNorms(doc, segment);
    }

    /// <summary>
    /// Tokenizes the fields of a document into Postings.
    /// </summary>
    /// <param name="doc">The document whose indexed fields are inverted.</param>
    /// <exception cref="ArgumentException">
    /// A tokenized field has neither a reader value nor a string value.
    /// </exception>
    private void InvertDocument(Document doc)
    {
        foreach (Field field in doc.Fields())
        {
            if (!field.IsIndexed())
            {
                continue;
            }

            String fieldName = field.Name();
            int fieldNumber = fieldInfos.FieldNumber(fieldName);

            // Continue where a previous field with the same number left off.
            int position = fieldLengths[fieldNumber];

            if (!field.IsTokenized())
            {
                // un-tokenized field: the whole value is a single term
                AddPosition(fieldName, field.StringValue(), position++);
            }
            else
            {
                // find or make a reader; a reader value takes precedence
                TextReader reader = field.ReaderValue();
                if (reader == null)
                {
                    String text = field.StringValue();
                    if (text == null)
                    {
                        throw new ArgumentException
                            ("field must have either String or Reader value");
                    }
                    reader = new StringReader(text);
                }

                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.TokenStream(fieldName, reader);
                try
                {
                    for (Token t = stream.Next(); t != null; t = stream.Next())
                    {
                        // an increment of 1 means "next position"; anything else shifts it
                        position += (t.GetPositionIncrement() - 1);
                        AddPosition(fieldName, t.TermText(), position++);
                        if (position > maxFieldLength)
                        {
                            break;
                        }
                    }
                }
                finally
                {
                    stream.Close();
                }
            }

            fieldLengths[fieldNumber] = position;         // save field length
            fieldBoosts[fieldNumber] *= field.GetBoost();
        }
    }

    /// <summary>
    /// Records one occurrence of the term (<paramref name="field"/>,
    /// <paramref name="text"/>) at <paramref name="position"/>, creating the
    /// Posting on first sight and growing its positions array when full.
    /// </summary>
    private void AddPosition(String field, String text, int position)
    {
        termBuffer.Set(field, text);
        Posting ti = (Posting)postingTable[termBuffer];
        if (ti != null)
        {
            // word seen before
            int freq = ti.freq;
            if (ti.positions.Length == freq)
            {
                // positions array is full: double its size, keeping the old entries
                int[] newPositions = new int[freq * 2];
                Array.Copy(ti.positions, newPositions, freq);
                ti.positions = newPositions;
            }
            ti.positions[freq] = position;                // add new position
            ti.freq = freq + 1;                           // update frequency
        }
        else
        {
            // word not seen before: termBuffer is reused, so store a fresh Term as key
            // NOTE(review): the third constructor argument presumably disables interning - confirm.
            Term term = new Term(field, text, false);
            postingTable.Add(term, new Posting(term, position));
        }
    }

    /// <summary>
    /// Copies the buffered postings into an array ordered by term.
    /// </summary>
    /// <returns>All postings of the current document, sorted by Term.CompareTo.</returns>
    private Posting[] SortPostingTable()
    {
        // copy postingTable into an array
        Posting[] array = new Posting[postingTable.Count];
        postingTable.Values.CopyTo(array, 0);

        // sort the array
        QuickSort(array, 0, array.Length - 1);

        return array;
    }

    /// <summary>
    /// Sorts <c>postings[lo..hi]</c> (both inclusive) in place by term, using a
    /// quicksort whose pivot is the median of the first, middle and last element.
    /// </summary>
    private static void QuickSort(Posting[] postings, int lo, int hi)
    {
        if (lo >= hi)
        {
            return;
        }

        // same value as (lo + hi) / 2, but cannot overflow
        int mid = lo + (hi - lo) / 2;

        // order postings[lo], postings[mid], postings[hi] among themselves
        if (postings[lo].term.CompareTo(postings[mid].term) > 0)
        {
            Swap(postings, lo, mid);
        }

        if (postings[mid].term.CompareTo(postings[hi].term) > 0)
        {
            Swap(postings, mid, hi);

            if (postings[lo].term.CompareTo(postings[mid].term) > 0)
            {
                Swap(postings, lo, mid);
            }
        }

        int left = lo + 1;
        int right = hi - 1;

        // three or fewer elements are already sorted by the step above
        if (left >= right)
        {
            return;
        }

        Term partition = postings[mid].term;

        for (;;)
        {
            while (postings[right].term.CompareTo(partition) > 0)
            {
                --right;
            }

            while (left < right && postings[left].term.CompareTo(partition) <= 0)
            {
                ++left;
            }

            if (left < right)
            {
                Swap(postings, left, right);
                --right;
            }
            else
            {
                break;
            }
        }

        QuickSort(postings, lo, left);
        QuickSort(postings, left + 1, hi);
    }

    /// <summary>Exchanges the elements at indices <paramref name="a"/> and <paramref name="b"/>.</summary>
    private static void Swap(Posting[] postings, int a, int b)
    {
        Posting tmp = postings[a];
        postings[a] = postings[b];
        postings[b] = tmp;
    }

    /// <summary>
    /// Writes the sorted postings: a term dictionary entry per term, frequencies
    /// to <c>.frq</c> and delta-encoded positions to <c>.prx</c>.
    /// </summary>
    /// <param name="postings">Postings in term order.</param>
    /// <param name="segment">Segment name used as the file name prefix.</param>
    private void WritePostings(Posting[] postings, String segment)
    {
        OutputStream freq = null, prox = null;
        TermInfosWriter tis = null;

        try
        {
            freq = directory.CreateFile(segment + ".frq");
            prox = directory.CreateFile(segment + ".prx");
            tis = new TermInfosWriter(directory, segment, fieldInfos);
            TermInfo ti = new TermInfo();

            for (int i = 0; i < postings.Length; i++)
            {
                Posting posting = postings[i];

                // add an entry to the dictionary with pointers to prox and freq files
                // NOTE(review): the leading 1 is presumably the document frequency
                // (this segment holds exactly one document) - confirm against TermInfo.Set.
                ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer());
                tis.Add(posting.term, ti);

                // add an entry to the freq file
                int f = posting.freq;
                if (f == 1)
                {
                    // optimize freq=1: set low bit of doc num.
                    freq.WriteVInt(1);
                }
                else
                {
                    freq.WriteVInt(0);                    // the document number
                    freq.WriteVInt(f);                    // frequency in doc
                }

                // write positions, using delta-encoding
                int lastPosition = 0;
                int[] positions = posting.positions;
                for (int j = 0; j < f; j++)
                {
                    int position = positions[j];
                    prox.WriteVInt(position - lastPosition);
                    lastPosition = position;
                }
            }
        }
        finally
        {
            // Nested so that a failing Close() cannot prevent the remaining
            // outputs from being closed.
            try
            {
                if (freq != null)
                {
                    freq.Close();
                }
            }
            finally
            {
                try
                {
                    if (prox != null)
                    {
                        prox.Close();
                    }
                }
                finally
                {
                    if (tis != null)
                    {
                        tis.Close();
                    }
                }
            }
        }
    }

    /// <summary>
    /// Writes one encoded norm byte per indexed field number to
    /// <c>segment + ".f" + fieldNumber</c>.
    /// </summary>
    /// <param name="doc">The document whose indexed fields get a norm.</param>
    /// <param name="segment">Segment name used as the file name prefix.</param>
    private void WriteNorms(Document doc, String segment)
    {
        // Several fields may share a name and therefore a field number; the norm
        // depends only on the field number, so each file is written just once.
        bool[] written = new bool[fieldLengths.Length];

        foreach (Field field in doc.Fields())
        {
            if (!field.IsIndexed())
            {
                continue;
            }

            int n = fieldInfos.FieldNumber(field.Name());
            if (written[n])
            {
                continue;
            }
            written[n] = true;

            float norm =
                fieldBoosts[n] * similarity.LengthNorm(field.Name(), fieldLengths[n]);
            OutputStream norms = directory.CreateFile(segment + ".f" + n);
            try
            {
                norms.WriteByte(Similarity.EncodeNorm(norm));
            }
            finally
            {
                norms.Close();
            }
        }
    }
}
/// <summary>
/// Info about a Term in a doc: the term itself, how often it occurs and
/// the positions it occurs at.
/// </summary>
sealed class Posting
{
    internal Term term;          // the Term
    internal int freq;           // its frequency in doc
    internal int[] positions;    // positions it occurs at; only the first freq entries are used

    /// <summary>
    /// Creates the posting for the first occurrence of <paramref name="t"/>.
    /// </summary>
    /// <param name="t">The term this posting describes.</param>
    /// <param name="position">Position of that first occurrence.</param>
    internal Posting(Term t, int position)
    {
        this.term = t;
        this.positions = new int[] { position };
        this.freq = 1;
    }
}