/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using Similarity = Lucene.Net.Search.Similarity;
using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;
namespace Lucene.Net.Index
{
    sealed public class DocumentWriter
    {
        private void InitBlock()
        {
            termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
        }
        private Analyzer analyzer;
        private Directory directory;
        private Similarity similarity;
        private FieldInfos fieldInfos;
        private int maxFieldLength;
        private int termIndexInterval;
        private System.IO.TextWriter infoStream;
        /// <summary>This ctor is used by test code only.</summary>
        /// <param name="directory">The directory to write the document information to</param>
        /// <param name="analyzer">The analyzer to use for the document</param>
        /// <param name="similarity">The Similarity function</param>
        /// <param name="maxFieldLength">The maximum number of tokens a field may have</param>
        internal DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
        {
            InitBlock();
            this.directory = directory;
            this.analyzer = analyzer;
            this.similarity = similarity;
            this.maxFieldLength = maxFieldLength;
        }
        internal DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer)
        {
            InitBlock();
            this.directory = directory;
            this.analyzer = analyzer;
            this.similarity = writer.GetSimilarity();
            this.maxFieldLength = writer.GetMaxFieldLength();
            this.termIndexInterval = writer.GetTermIndexInterval();
        }
        /*internal*/ public void AddDocument(System.String segment, Document doc)
        {
            // write Field names
            fieldInfos = new FieldInfos();
            fieldInfos.Add(doc);
            fieldInfos.Write(directory, segment + ".fnm");

            // write Field values
            FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
            try
            {
                fieldsWriter.AddDocument(doc);
            }
            finally
            {
                fieldsWriter.Close();
            }

            // invert doc into postingTable
            postingTable.Clear(); // clear postingTable
            fieldLengths = new int[fieldInfos.Size()]; // init fieldLengths
            fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions
            fieldOffsets = new int[fieldInfos.Size()]; // init fieldOffsets

            fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts
            float boost = doc.GetBoost();
            for (int i = 0; i < fieldBoosts.Length; i++)
            {
                fieldBoosts[i] = boost;
            }

            InvertDocument(doc);

            // sort postingTable into an array
            Posting[] postings = SortPostingTable();

            /* Debugging aid carried over (unported Java code), left commented out:
            for (int i = 0; i < postings.length; i++) {
                Posting posting = postings[i];
                System.out.print(posting.term);
                System.out.print(" freq=" + posting.freq);
                System.out.print(" pos=");
                System.out.print(posting.positions[0]);
                for (int j = 1; j < posting.freq; j++)
                    System.out.print("," + posting.positions[j]);
                System.out.println("");
            }
            */

            // write postings
            WritePostings(postings, segment);

            // write norms of indexed fields
            WriteNorms(segment);
        }
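        // Minimal usage sketch (added commentary, not part of the original source; the
        // RAMDirectory and SimpleAnalyzer types and the Field(name, value, Store, Index)
        // constructor are assumed from the Lucene.Net 1.9-era API this port tracks):
        //
        //   Directory dir = new Lucene.Net.Store.RAMDirectory();
        //   Analyzer a = new Lucene.Net.Analysis.SimpleAnalyzer();
        //   DocumentWriter writer = new DocumentWriter(dir, a, Similarity.GetDefault(), 10000);
        //   Document doc = new Document();
        //   doc.Add(new Field("contents", "hello hello world",
        //                     Field.Store.YES, Field.Index.TOKENIZED));
        //   writer.AddDocument("_test", doc);  // writes _test.fnm, stored fields, .frq, .prx, .f0, ...
        //
        // IndexWriter normally drives this class internally; calling it directly, as the
        // test-only ctor above suggests, requires access to these internal types.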
        // Keys are Terms, values are Postings.
        // Used to buffer a document before it is written to the index.
        private System.Collections.Hashtable postingTable = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());
        private int[] fieldLengths;
        private int[] fieldPositions;
        private int[] fieldOffsets;
        private float[] fieldBoosts;
        // Tokenizes the fields of a document into Postings.
        private void InvertDocument(Document doc)
        {
            foreach (Field field in doc.Fields())
            {
                System.String fieldName = field.Name();
                int fieldNumber = fieldInfos.FieldNumber(fieldName);

                int length = fieldLengths[fieldNumber]; // length of field
                int position = fieldPositions[fieldNumber]; // position in field
                int offset = fieldOffsets[fieldNumber]; // offset of field

                if (field.IsIndexed())
                {
                    if (!field.IsTokenized())
                    {
                        // un-tokenized field
                        System.String stringValue = field.StringValue();
                        if (field.IsStoreOffsetWithTermVector())
                            AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                        else
                            AddPosition(fieldName, stringValue, position++, null);
                        offset += stringValue.Length;
                        length++;
                    }
                    else
                    {
                        System.IO.TextReader reader; // find or make Reader
                        if (field.ReaderValue() != null)
                            reader = field.ReaderValue();
                        else if (field.StringValue() != null)
                            reader = new System.IO.StringReader(field.StringValue());
                        else
                            throw new System.ArgumentException("field must have either String or Reader value");

                        // Tokenize field and add to postingTable
                        TokenStream stream = analyzer.TokenStream(fieldName, reader);
                        try
                        {
                            Token lastToken = null;
                            for (Token t = stream.Next(); t != null; t = stream.Next())
                            {
                                position += (t.GetPositionIncrement() - 1);

                                if (field.IsStoreOffsetWithTermVector())
                                    AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                                else
                                    AddPosition(fieldName, t.TermText(), position++, null);

                                lastToken = t;
                                if (++length > maxFieldLength)
                                {
                                    if (infoStream != null)
                                        infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                                    break;
                                }
                            }

                            if (lastToken != null)
                                offset += lastToken.EndOffset() + 1;
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    fieldLengths[fieldNumber] = length; // save field length
                    fieldPositions[fieldNumber] = position; // save field position
                    fieldBoosts[fieldNumber] *= field.GetBoost();
                    fieldOffsets[fieldNumber] = offset;
                }
            }
        }
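        // Worked example (added commentary, not from the original source): assume an
        // analyzer whose stop filter drops "the" but reports the gap through a position
        // increment of 2 (not every filter of this era does so). For the text
        // "the quick fox", the tokens are "quick" (increment 1) and "fox" (increment 2).
        // The loop above adds (GetPositionIncrement() - 1) before the post-increment, so
        // "quick" is recorded at position 0 and "fox" at position 2, leaving a phantom
        // slot where the stop word was.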
        private Term termBuffer = new Term("", ""); // avoid consing

        private void AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
        {
            termBuffer.Set(field, text);
            //System.out.println("Offset: " + offset);
            Posting ti = (Posting) postingTable[termBuffer];
            if (ti != null)
            {
                // word seen before
                int freq = ti.freq;
                if (ti.positions.Length == freq)
                {
                    // positions array is full
                    int[] newPositions = new int[freq * 2]; // double size
                    int[] positions = ti.positions;
                    for (int i = 0; i < freq; i++)
                        // copy old positions to new
                        newPositions[i] = positions[i];
                    ti.positions = newPositions;
                }
                ti.positions[freq] = position; // add new position

                if (offset != null)
                {
                    if (ti.offsets.Length == freq)
                    {
                        TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
                        TermVectorOffsetInfo[] offsets = ti.offsets;
                        for (int i = 0; i < freq; i++)
                        {
                            newOffsets[i] = offsets[i];
                        }
                        ti.offsets = newOffsets;
                    }
                    ti.offsets[freq] = offset;
                }
                ti.freq = freq + 1; // update frequency
            }
            else
            {
                // word not seen before
                Term term = new Term(field, text, false);
                postingTable[term] = new Posting(term, position, offset);
            }
        }
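        // Added note: the positions and offsets arrays grow by doubling, so appending one
        // more occurrence is amortized O(1). The shared termBuffer is reused for the
        // hashtable lookup, so a new Term is only allocated the first time a
        // (field, text) pair is seen in the document.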
        private Posting[] SortPostingTable()
        {
            // copy postingTable into an array
            Posting[] array = new Posting[postingTable.Count];
            System.Collections.IEnumerator postings = postingTable.Values.GetEnumerator();
            for (int i = 0; postings.MoveNext(); i++)
            {
                array[i] = (Posting) postings.Current;
            }

            // sort the array
            QuickSort(array, 0, array.Length - 1);

            return array;
        }
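        // Added note: Hashtable enumeration order is arbitrary, and the term dictionary
        // writer below expects terms in increasing Term order, so the postings must be
        // sorted before WritePostings hands them to TermInfosWriter.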
        private static void QuickSort(Posting[] postings, int lo, int hi)
        {
            if (lo >= hi)
                return;

            int mid = (lo + hi) / 2;

            if (postings[lo].term.CompareTo(postings[mid].term) > 0)
            {
                Posting tmp = postings[lo];
                postings[lo] = postings[mid];
                postings[mid] = tmp;
            }

            if (postings[mid].term.CompareTo(postings[hi].term) > 0)
            {
                Posting tmp = postings[mid];
                postings[mid] = postings[hi];
                postings[hi] = tmp;

                if (postings[lo].term.CompareTo(postings[mid].term) > 0)
                {
                    Posting tmp2 = postings[lo];
                    postings[lo] = postings[mid];
                    postings[mid] = tmp2;
                }
            }

            int left = lo + 1;
            int right = hi - 1;

            if (left >= right)
                return;

            Term partition = postings[mid].term;

            for (; ; )
            {
                while (postings[right].term.CompareTo(partition) > 0)
                    --right;

                while (left < right && postings[left].term.CompareTo(partition) <= 0)
                    ++left;

                if (left < right)
                {
                    Posting tmp = postings[left];
                    postings[left] = postings[right];
                    postings[right] = tmp;
                    --right;
                }
                else
                {
                    break;
                }
            }

            QuickSort(postings, lo, left);
            QuickSort(postings, left + 1, hi);
        }
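        // Added note: the three comparisons at the top act as a median-of-three pivot
        // selection (ordering postings[lo], postings[mid], postings[hi]) before the
        // partition loop, which avoids quadratic behaviour on already-sorted input.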
        private void WritePostings(Posting[] postings, System.String segment)
        {
            IndexOutput freq = null, prox = null;
            TermInfosWriter tis = null;
            TermVectorsWriter termVectorWriter = null;
            try
            {
                // open files for inverse index storage
                freq = directory.CreateOutput(segment + ".frq");
                prox = directory.CreateOutput(segment + ".prx");
                tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                TermInfo ti = new TermInfo();
                System.String currentField = null;

                for (int i = 0; i < postings.Length; i++)
                {
                    Posting posting = postings[i];

                    // add an entry to the dictionary with pointers to prox and freq files
                    ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
                    tis.Add(posting.term, ti);

                    // add an entry to the freq file
                    int postingFreq = posting.freq;
                    if (postingFreq == 1)
                        // optimize freq=1: set low bit of doc num
                        freq.WriteVInt(1);
                    else
                    {
                        freq.WriteVInt(0); // the document number
                        freq.WriteVInt(postingFreq); // frequency in doc
                    }

                    int lastPosition = 0; // write positions
                    int[] positions = posting.positions;
                    for (int j = 0; j < postingFreq; j++)
                    {
                        // use delta-encoding
                        int position = positions[j];
                        prox.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }

                    // check to see if we switched to a new Field
                    System.String termField = posting.term.Field();
                    if ((System.Object) currentField != (System.Object) termField)
                    {
                        // changing Field - see if there is something to save
                        currentField = termField;
                        FieldInfo fi = fieldInfos.FieldInfo(currentField);
                        if (fi.storeTermVector)
                        {
                            if (termVectorWriter == null)
                            {
                                termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                                termVectorWriter.OpenDocument();
                            }
                            termVectorWriter.OpenField(currentField);
                        }
                        else if (termVectorWriter != null)
                        {
                            termVectorWriter.CloseField();
                        }
                    }
                    if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
                    {
                        termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
                    }
                }
                if (termVectorWriter != null)
                    termVectorWriter.CloseDocument();
            }
            finally
            {
                // make an effort to close all streams we can but remember and re-throw
                // the first exception encountered in this process
                System.IO.IOException keep = null;
                if (freq != null)
                {
                    try
                    {
                        freq.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                            keep = e;
                    }
                }
                if (prox != null)
                {
                    try
                    {
                        prox.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                            keep = e;
                    }
                }
                if (tis != null)
                {
                    try
                    {
                        tis.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                            keep = e;
                    }
                }
                if (termVectorWriter != null)
                {
                    try
                    {
                        termVectorWriter.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                            keep = e;
                    }
                }
                if (keep != null)
                {
                    throw new System.IO.IOException(keep.StackTrace);
                }
            }
        }
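        // Worked example (added commentary, not from the original source): a term that
        // occurs at positions 3, 7 and 9 in this single-document segment gets the freq-file
        // entries 0 then 3 (doc-number delta 0, frequency 3) and the prox-file entries
        // 3, 4, 2 (position deltas), all written as VInts. A term that occurs exactly once
        // gets the single freq-file entry 1 (doc delta 0 with the low "freq == 1" bit set),
        // and no separate frequency value.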
        private void WriteNorms(System.String segment)
        {
            for (int n = 0; n < fieldInfos.Size(); n++)
            {
                FieldInfo fi = fieldInfos.FieldInfo(n);
                if (fi.isIndexed)
                {
                    float norm = fieldBoosts[n] * similarity.LengthNorm(fi.name, fieldLengths[n]);
                    IndexOutput norms = directory.CreateOutput(segment + ".f" + n);
                    try
                    {
                        norms.WriteByte(Similarity.EncodeNorm(norm));
                    }
                    finally
                    {
                        norms.Close();
                    }
                }
            }
        }
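        // Added note: with the default Similarity, LengthNorm(field, numTokens) is
        // 1 / sqrt(numTokens), so a 4-token field with boost 1.0 gets norm 0.5; EncodeNorm
        // packs that into the single byte stored in the segment's ".f<n>" file (a lossy
        // 8-bit float encoding).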
        /// <summary>If non-null, a message will be printed to this if maxFieldLength is reached.</summary>
        internal void SetInfoStream(System.IO.TextWriter infoStream)
        {
            this.infoStream = infoStream;
        }
    }
    sealed class Posting
    {
        // info about a Term in a doc
        internal Term term; // the Term
        internal int freq; // its frequency in doc
        internal int[] positions; // positions it occurs at
        internal TermVectorOffsetInfo[] offsets;

        internal Posting(Term t, int position, TermVectorOffsetInfo offset)
        {
            term = t;
            freq = 1;
            positions = new int[1];
            positions[0] = position;
            if (offset != null)
            {
                offsets = new TermVectorOffsetInfo[1];
                offsets[0] = offset;
            }
            else
                offsets = null;
        }
    }
}