Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / Lucene.Net / Index / DocumentWriter.cs
blob 2b3691bc24db589901773cfe7736538f3978de77
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using Similarity = Lucene.Net.Search.Similarity;
using Directory = Lucene.Net.Store.Directory;
using OutputStream = Lucene.Net.Store.OutputStream;

namespace Lucene.Net.Index
{
    sealed public class DocumentWriter
    {
        private Analyzer analyzer;
        private Directory directory;
        private Similarity similarity;
        private FieldInfos fieldInfos;
        private int maxFieldLength;

        /// <summary> </summary>
        /// <param name="directory">The directory to write the document information to
        /// </param>
        /// <param name="analyzer">The analyzer to use for the document
        /// </param>
        /// <param name="similarity">The Similarity function
        /// </param>
        /// <param name="maxFieldLength">The maximum number of tokens a Field may have
        /// </param>
        public /*internal*/ DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
        {
            this.directory = directory;
            this.analyzer = analyzer;
            this.similarity = similarity;
            this.maxFieldLength = maxFieldLength;
        }
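
        /// <summary> Adds a single document to a new segment: writes the Field
        /// names (.fnm) and the stored Field values, inverts the document into
        /// postingTable, then writes the sorted postings and the per-field
        /// norms for the segment.
        /// </summary>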
        /*internal*/ public void AddDocument(System.String segment, Document doc)
        {
            // write Field names
            fieldInfos = new FieldInfos();
            fieldInfos.Add(doc);
            fieldInfos.Write(directory, segment + ".fnm");

            // write Field values
            FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
            try
            {
                fieldsWriter.AddDocument(doc);
            }
            finally
            {
                fieldsWriter.Close();
            }

            // invert doc into postingTable
            postingTable.Clear(); // clear postingTable
            fieldLengths = new int[fieldInfos.Size()]; // init fieldLengths
            fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions

            fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts
            float boost = doc.GetBoost();
            for (int i = 0; i < fieldBoosts.Length; i++)
            {
                fieldBoosts[i] = boost;
            }

            InvertDocument(doc);

            // sort postingTable into an array
            Posting[] postings = SortPostingTable();

            /* debug: dump the sorted postings
            for (int i = 0; i < postings.Length; i++)
            {
                Posting posting = postings[i];
                System.Console.Out.Write(posting.term);
                System.Console.Out.Write(" freq=" + posting.freq);
                System.Console.Out.Write(" pos=");
                System.Console.Out.Write(posting.positions[0]);
                for (int j = 1; j < posting.freq; j++)
                    System.Console.Out.Write("," + posting.positions[j]);
                System.Console.Out.WriteLine("");
            }
            */

            // write postings
            WritePostings(postings, segment);

            // write norms of indexed fields
            WriteNorms(doc, segment);
        }

        // Keys are Terms, values are Postings.
        // Used to buffer a document before it is written to the index.
        private System.Collections.Hashtable postingTable = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());
        private int[] fieldLengths;
        private int[] fieldPositions;
        private float[] fieldBoosts;

        // Tokenizes the fields of a document into Postings.
        private void InvertDocument(Document doc)
        {
            foreach (Field field in doc.Fields())
            {
                System.String fieldName = field.Name();
                int fieldNumber = fieldInfos.FieldNumber(fieldName);

                int length = fieldLengths[fieldNumber]; // length of Field
                int position = fieldPositions[fieldNumber]; // position in Field

                if (field.IsIndexed())
                {
                    if (!field.IsTokenized())
                    {
                        // un-tokenized Field
                        AddPosition(fieldName, field.StringValue(), position++);
                        length++;
                    }
                    else
                    {
                        System.IO.TextReader reader; // find or make Reader
                        if (field.ReaderValue() != null)
                            reader = field.ReaderValue();
                        else if (field.StringValue() != null)
                            reader = new System.IO.StringReader(field.StringValue());
                        else
                            throw new System.ArgumentException("Field must have either String or Reader value");

                        // Tokenize Field and add to postingTable
                        TokenStream stream = analyzer.TokenStream(fieldName, reader);
                        try
                        {
                            for (Token t = stream.Next(); t != null; t = stream.Next())
                            {
                                position += (t.GetPositionIncrement() - 1);
                                AddPosition(fieldName, t.TermText(), position++);
                                if (++length > maxFieldLength)
                                    break;
                            }
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    fieldLengths[fieldNumber] = length; // save Field length
                    fieldPositions[fieldNumber] = position; // save Field position
                    fieldBoosts[fieldNumber] *= field.GetBoost();
                }
            }
        }
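
        // AddPosition either extends an existing Posting for the (field, text)
        // Term, doubling its positions array when it fills up, or creates a
        // new Posting. termBuffer is reused for the hashtable lookup so the
        // common "term already seen" path allocates nothing.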
        private Term termBuffer = new Term("", ""); // avoid consing

        private void AddPosition(System.String field, System.String text, int position)
        {
            termBuffer.Set(field, text);
            Posting ti = (Posting) postingTable[termBuffer];
            if (ti != null)
            {
                // word seen before
                int freq = ti.freq;
                if (ti.positions.Length == freq)
                {
                    // positions array is full
                    int[] newPositions = new int[freq * 2]; // double size
                    int[] positions = ti.positions;
                    for (int i = 0; i < freq; i++)
                        newPositions[i] = positions[i]; // copy old positions to new
                    ti.positions = newPositions;
                }
                ti.positions[freq] = position; // add new position
                ti.freq = freq + 1; // update frequency
            }
            else
            {
                // word not seen before
                Term term = new Term(field, text, false);
                postingTable[term] = new Posting(term, position);
            }
        }

        private Posting[] SortPostingTable()
        {
            // copy postingTable into an array
            Posting[] array = new Posting[postingTable.Count];
            System.Collections.IEnumerator postings = postingTable.Values.GetEnumerator();
            for (int i = 0; postings.MoveNext(); i++)
            {
                array[i] = (Posting) postings.Current;
            }

            // sort the array
            QuickSort(array, 0, array.Length - 1);

            return array;
        }
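
        // Recursive quicksort of the postings by Term, using the median of the
        // first, middle and last elements as the pivot.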
        private static void QuickSort(Posting[] postings, int lo, int hi)
        {
            if (lo >= hi)
                return;

            int mid = (lo + hi) / 2;

            if (postings[lo].term.CompareTo(postings[mid].term) > 0)
            {
                Posting tmp = postings[lo];
                postings[lo] = postings[mid];
                postings[mid] = tmp;
            }

            if (postings[mid].term.CompareTo(postings[hi].term) > 0)
            {
                Posting tmp = postings[mid];
                postings[mid] = postings[hi];
                postings[hi] = tmp;

                if (postings[lo].term.CompareTo(postings[mid].term) > 0)
                {
                    Posting tmp2 = postings[lo];
                    postings[lo] = postings[mid];
                    postings[mid] = tmp2;
                }
            }

            int left = lo + 1;
            int right = hi - 1;

            if (left >= right)
                return;

            Term partition = postings[mid].term;

            for (; ; )
            {
                while (postings[right].term.CompareTo(partition) > 0)
                    --right;

                while (left < right && postings[left].term.CompareTo(partition) <= 0)
                    ++left;

                if (left < right)
                {
                    Posting tmp = postings[left];
                    postings[left] = postings[right];
                    postings[right] = tmp;
                    --right;
                }
                else
                {
                    break;
                }
            }

            QuickSort(postings, lo, left);
            QuickSort(postings, left + 1, hi);
        }
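
        // Writes the sorted postings for this document to the segment's term
        // dictionary (TermInfosWriter), frequency file (.frq) and position
        // file (.prx). Positions are delta-encoded, and term vectors are
        // written for any field whose FieldInfo requests them.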
        private void WritePostings(Posting[] postings, System.String segment)
        {
            OutputStream freq = null, prox = null;
            TermInfosWriter tis = null;
            TermVectorsWriter termVectorWriter = null;
            try
            {
                // open files for inverse index storage
                freq = directory.CreateFile(segment + ".frq");
                prox = directory.CreateFile(segment + ".prx");
                tis = new TermInfosWriter(directory, segment, fieldInfos);
                TermInfo ti = new TermInfo();
                System.String currentField = null;

                for (int i = 0; i < postings.Length; i++)
                {
                    Posting posting = postings[i];

                    // add an entry to the dictionary with pointers to prox and freq files
                    ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
                    tis.Add(posting.term, ti);

                    // add an entry to the freq file
                    int postingFreq = posting.freq;
                    if (postingFreq == 1)
                        freq.WriteVInt(1); // optimize freq=1: set low bit of doc num.
                    else
                    {
                        freq.WriteVInt(0); // the document number
                        freq.WriteVInt(postingFreq); // frequency in doc
                    }

                    int lastPosition = 0; // write positions
                    int[] positions = posting.positions;
                    for (int j = 0; j < postingFreq; j++)
                    {
                        // use delta-encoding
                        int position = positions[j];
                        prox.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }

                    // check to see if we switched to a new Field
                    System.String termField = posting.term.Field();
                    if ((System.Object) currentField != (System.Object) termField)
                    {
                        // changing Field - see if there is something to save
                        currentField = termField;
                        FieldInfo fi = fieldInfos.FieldInfo(currentField);
                        if (fi.storeTermVector)
                        {
                            if (termVectorWriter == null)
                            {
                                termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                                termVectorWriter.OpenDocument();
                            }
                            termVectorWriter.OpenField(currentField);
                        }
                        else if (termVectorWriter != null)
                        {
                            termVectorWriter.CloseField();
                        }
                    }
                    if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
                    {
                        termVectorWriter.AddTerm(posting.term.Text(), postingFreq);
                    }
                }
                if (termVectorWriter != null)
                    termVectorWriter.CloseDocument();
            }
            finally
            {
                // make an effort to close all streams we can but remember and re-throw
                // the first exception encountered in this process
                System.IO.IOException keep = null;
                if (freq != null)
                {
                    try { freq.Close(); }
                    catch (System.IO.IOException e) { if (keep == null) keep = e; }
                }
                if (prox != null)
                {
                    try { prox.Close(); }
                    catch (System.IO.IOException e) { if (keep == null) keep = e; }
                }
                if (tis != null)
                {
                    try { tis.Close(); }
                    catch (System.IO.IOException e) { if (keep == null) keep = e; }
                }
                if (termVectorWriter != null)
                {
                    try { termVectorWriter.Close(); }
                    catch (System.IO.IOException e) { if (keep == null) keep = e; }
                }
                if (keep != null)
                    throw new System.IO.IOException(keep.StackTrace);
            }
        }
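
        // Writes one norm byte per indexed field for this document: the field
        // boost multiplied by the Similarity length norm, encoded with
        // Similarity.EncodeNorm and stored in the segment's ".f<n>" file.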
        private void WriteNorms(Document doc, System.String segment)
        {
            for (int n = 0; n < fieldInfos.Size(); n++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(n);
                if (fi.isIndexed)
                {
                    float norm = fieldBoosts[n] * similarity.LengthNorm(fi.name, fieldLengths[n]);
                    OutputStream norms = directory.CreateFile(segment + ".f" + n);
                    try { norms.WriteByte(Lucene.Net.Search.Similarity.EncodeNorm(norm)); }
                    finally { norms.Close(); }
                }
            }
        }
    }

    sealed class Posting
    {
        // info about a Term in a doc
        internal Term term; // the Term
        internal int freq; // its frequency in doc
        internal int[] positions; // positions it occurs at

        internal Posting(Term t, int position)
        {
            term = t;
            freq = 1;
            positions = new int[1];
            positions[0] = position;
        }
    }
}
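
// Rough usage sketch (not part of this file): callers create a DocumentWriter
// per document and write it into a fresh single-document segment. The names
// "ramDirectory" and "NewSegmentName" below are illustrative assumptions, not
// APIs defined in this file.
//
//     DocumentWriter writer = new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
//     System.String segmentName = NewSegmentName();
//     writer.AddDocument(segmentName, doc);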