cvsimport
[beagle.git] / beagled / Lucene.Net / Index / DocumentWriter.cs
blob9583c42e78303dc16b33fbb72ce66fc4b1ac0a9b
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 using System;
18 using Analyzer = Lucene.Net.Analysis.Analyzer;
19 using Token = Lucene.Net.Analysis.Token;
20 using TokenStream = Lucene.Net.Analysis.TokenStream;
21 using Document = Lucene.Net.Documents.Document;
22 using Field = Lucene.Net.Documents.Field;
23 using Similarity = Lucene.Net.Search.Similarity;
24 using Directory = Lucene.Net.Store.Directory;
25 using IndexOutput = Lucene.Net.Store.IndexOutput;
27 namespace Lucene.Net.Index
30 public sealed class DocumentWriter
private void InitBlock()
{
    // termIndexInterval's default comes from IndexWriter, so it cannot be an
    // inline field initializer; both constructors call this before anything else.
    termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
}

private Analyzer analyzer;                  // tokenizes field text
private Directory directory;                // where the segment's files are created
private Similarity similarity;              // supplies LengthNorm() for field norms
private FieldInfos fieldInfos;              // field metadata for the current document
private int maxFieldLength;                 // max number of tokens indexed per field
private int termIndexInterval;              // spacing of entries in the term index
private System.IO.TextWriter infoStream;    // optional diagnostics sink (may be null)
/// <summary>This ctor used by test code only.</summary>
/// <param name="directory">The directory to write the document information to</param>
/// <param name="analyzer">The analyzer to use for the document</param>
/// <param name="similarity">The Similarity function</param>
/// <param name="maxFieldLength">The maximum number of tokens a field may have</param>
public DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
{
    InitBlock();
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = similarity;
    this.maxFieldLength = maxFieldLength;
}
/// <summary>Creates a writer whose similarity, maxFieldLength and
/// termIndexInterval are taken from the owning <see cref="IndexWriter"/>.</summary>
/// <param name="directory">The directory to write the document information to</param>
/// <param name="analyzer">The analyzer to use for the document</param>
/// <param name="writer">The IndexWriter whose settings are copied</param>
public DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer)
{
    InitBlock();
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = writer.GetSimilarity();
    this.maxFieldLength = writer.GetMaxFieldLength();
    this.termIndexInterval = writer.GetTermIndexInterval();
}
/// <summary>Writes one document as a new segment: field names (.fnm),
/// stored field values, the inverted postings (.frq/.prx plus the term
/// dictionary) and the per-field norms.</summary>
/// <param name="segment">base name for the segment's files</param>
/// <param name="doc">the document to index</param>
public /*internal*/ void AddDocument(System.String segment, Document doc)
{
    // write field names
    fieldInfos = new FieldInfos();
    fieldInfos.Add(doc);
    fieldInfos.Write(directory, segment + ".fnm");

    // write field values
    FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
    try
    {
        fieldsWriter.AddDocument(doc);
    }
    finally
    {
        // close even on failure so the stored-fields files are released
        fieldsWriter.Close();
    }

    // invert doc into postingTable
    postingTable.Clear(); // clear postingTable
    fieldLengths = new int[fieldInfos.Size()]; // init fieldLengths
    fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions
    fieldOffsets = new int[fieldInfos.Size()]; // init fieldOffsets
    fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts

    // every field starts from the document boost; per-field boosts are
    // multiplied in by InvertDocument()
    float boost = doc.GetBoost();
    for (int i = 0; i < fieldBoosts.Length; i++)
    {
        fieldBoosts[i] = boost;
    }

    InvertDocument(doc);

    // sort postingTable into an array
    Posting[] postings = SortPostingTable();

    // NOTE(review): a leftover block of *Java* debug code (System.out.print
    // over the postings array) sat here; upstream had it commented out and it
    // cannot compile as C#, so it has been removed.

    // write postings
    WritePostings(postings, segment);

    // write norms of indexed fields
    WriteNorms(segment);
}
// Keys are Terms, values are Postings.
// Used to buffer a document before it is written to the index.
private System.Collections.Hashtable postingTable = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());

// Per-field accumulators, indexed by field number; (re)allocated for every
// document in AddDocument and updated by InvertDocument.
private int[] fieldLengths;    // token count per field
private int[] fieldPositions;  // next token position per field
private int[] fieldOffsets;    // next character offset per field
private float[] fieldBoosts;   // cumulative boost per field
// Tokenizes the fields of a document into Postings.
private void InvertDocument(Document doc)
{
    foreach (Field field in doc.Fields())
    {
        System.String fieldName = field.Name();
        int fieldNumber = fieldInfos.FieldNumber(fieldName);

        int length = fieldLengths[fieldNumber];     // length of field
        int position = fieldPositions[fieldNumber]; // position in field
        if (length > 0)
            position += analyzer.GetPositionIncrementGap(fieldName);
        int offset = fieldOffsets[fieldNumber];     // offset field

        if (field.IsIndexed())
        {
            if (!field.IsTokenized())
            {
                // un-tokenized field: the whole value is a single term
                System.String stringValue = field.StringValue();
                if (field.IsStoreOffsetWithTermVector())
                    AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                else
                    AddPosition(fieldName, stringValue, position++, null);
                offset += stringValue.Length;
                length++;
            }
            else
            {
                System.IO.TextReader reader; // find or make Reader
                if (field.ReaderValue() != null)
                    reader = field.ReaderValue();
                else if (field.StringValue() != null)
                    reader = new System.IO.StringReader(field.StringValue());
                else
                    throw new System.ArgumentException("field must have either String or Reader value");

                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.TokenStream(fieldName, reader);
                try
                {
                    Token lastToken = null;
                    for (Token t = stream.Next(); t != null; t = stream.Next())
                    {
                        // honor the analyzer's position increments (gaps/synonyms)
                        position += (t.GetPositionIncrement() - 1);

                        if (field.IsStoreOffsetWithTermVector())
                            AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                        else
                            AddPosition(fieldName, t.TermText(), position++, null);

                        lastToken = t;
                        if (++length > maxFieldLength)
                        {
                            if (infoStream != null)
                                infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                            break;
                        }
                    }

                    if (lastToken != null)
                        offset += lastToken.EndOffset() + 1;
                }
                finally
                {
                    // always release the token stream, even when tokenizing throws
                    stream.Close();
                }
            }

            fieldLengths[fieldNumber] = length;     // save field length
            fieldPositions[fieldNumber] = position; // save field position
            fieldBoosts[fieldNumber] *= field.GetBoost();
            fieldOffsets[fieldNumber] = offset;
        }
    }
}
private Term termBuffer = new Term("", ""); // avoid consing

// Records one occurrence of (field, text) at the given position, growing the
// per-posting positions/offsets arrays geometrically as needed.
// offset may be null when term-vector offsets are not stored for the field.
private void AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
{
    // reuse a single Term instance for the lookup to avoid an allocation per token
    termBuffer.Set(field, text);
    Posting ti = (Posting) postingTable[termBuffer];
    if (ti != null)
    {
        // word seen before
        int freq = ti.freq;
        if (ti.positions.Length == freq)
        {
            // positions array is full: double its size
            int[] newPositions = new int[freq * 2];
            System.Array.Copy(ti.positions, 0, newPositions, 0, freq);
            ti.positions = newPositions;
        }
        ti.positions[freq] = position; // add new position

        if (offset != null)
        {
            if (ti.offsets.Length == freq)
            {
                // offsets array is full: double its size
                TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
                System.Array.Copy(ti.offsets, 0, newOffsets, 0, freq);
                ti.offsets = newOffsets;
            }
            ti.offsets[freq] = offset;
        }
        ti.freq = freq + 1; // update frequency
    }
    else
    {
        // word not seen before: allocate a dedicated Term as the hash key
        Term term = new Term(field, text, false);
        postingTable[term] = new Posting(term, position, offset);
    }
}
// Snapshots postingTable's values into an array and sorts it by term.
private Posting[] SortPostingTable()
{
    // copy postingTable into an array (ICollection.CopyTo replaces the
    // previous manual enumerator loop; same contents, same order source)
    Posting[] array = new Posting[postingTable.Count];
    postingTable.Values.CopyTo(array, 0);

    // sort the array
    QuickSort(array, 0, array.Length - 1);

    return array;
}
// In-place median-of-three quicksort of postings[lo..hi] by term.
private static void QuickSort(Posting[] postings, int lo, int hi)
{
    if (lo >= hi)
        return;

    // overflow-safe midpoint (was (lo + hi) / 2)
    int mid = lo + (hi - lo) / 2;

    // order postings[lo], postings[mid], postings[hi] so the median is at mid
    if (postings[lo].term.CompareTo(postings[mid].term) > 0)
    {
        Posting tmp = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp;
    }

    if (postings[mid].term.CompareTo(postings[hi].term) > 0)
    {
        Posting tmp = postings[mid];
        postings[mid] = postings[hi];
        postings[hi] = tmp;

        if (postings[lo].term.CompareTo(postings[mid].term) > 0)
        {
            Posting tmp2 = postings[lo];
            postings[lo] = postings[mid];
            postings[mid] = tmp2;
        }
    }

    int left = lo + 1;
    int right = hi - 1;

    // three or fewer elements are fully ordered by the swaps above
    if (left >= right)
        return;

    Term partition = postings[mid].term;

    for (; ; )
    {
        while (postings[right].term.CompareTo(partition) > 0)
            --right;

        while (left < right && postings[left].term.CompareTo(partition) <= 0)
            ++left;

        if (left < right)
        {
            Posting tmp = postings[left];
            postings[left] = postings[right];
            postings[right] = tmp;
            --right;
        }
        else
        {
            break;
        }
    }

    QuickSort(postings, lo, left);
    QuickSort(postings, left + 1, hi);
}
// Writes the sorted postings to the segment's .frq/.prx files and term
// dictionary, and (when enabled per field) to the term-vector files.
private void WritePostings(Posting[] postings, System.String segment)
{
    IndexOutput freq = null, prox = null;
    TermInfosWriter tis = null;
    TermVectorsWriter termVectorWriter = null;
    try
    {
        //open files for inverse index storage
        freq = directory.CreateOutput(segment + ".frq");
        prox = directory.CreateOutput(segment + ".prx");
        tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
        TermInfo ti = new TermInfo();
        System.String currentField = null;

        for (int i = 0; i < postings.Length; i++)
        {
            Posting posting = postings[i];

            // add an entry to the dictionary with pointers to prox and freq files
            ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), - 1);
            tis.Add(posting.term, ti);

            // add an entry to the freq file
            int postingFreq = posting.freq;
            if (postingFreq == 1)
            {
                // optimize freq=1: set low bit of doc num
                freq.WriteVInt(1);
            }
            else
            {
                freq.WriteVInt(0);           // the document number
                freq.WriteVInt(postingFreq); // frequency in doc
            }

            int lastPosition = 0; // write positions
            int[] positions = posting.positions;
            for (int j = 0; j < postingFreq; j++)
            {
                // use delta-encoding
                int position = positions[j];
                prox.WriteVInt(position - lastPosition);
                lastPosition = position;
            }

            // check to see if we switched to a new field
            System.String termField = posting.term.Field();
            if (currentField != termField)
            {
                // changing field - see if there is something to save
                currentField = termField;
                FieldInfo fi = fieldInfos.FieldInfo(currentField);
                if (fi.storeTermVector)
                {
                    if (termVectorWriter == null)
                    {
                        // lazily created: only when some field stores term vectors
                        termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                        termVectorWriter.OpenDocument();
                    }
                    termVectorWriter.OpenField(currentField);
                }
                else if (termVectorWriter != null)
                {
                    termVectorWriter.CloseField();
                }
            }
            if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
            {
                termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
            }
        }
        if (termVectorWriter != null)
            termVectorWriter.CloseDocument();
    }
    finally
    {
        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        System.IO.IOException keep = null;
        if (freq != null)
        {
            try
            {
                freq.Close();
            }
            catch (System.IO.IOException e)
            {
                if (keep == null)
                    keep = e;
            }
        }
        if (prox != null)
        {
            try
            {
                prox.Close();
            }
            catch (System.IO.IOException e)
            {
                if (keep == null)
                    keep = e;
            }
        }
        if (tis != null)
        {
            try
            {
                tis.Close();
            }
            catch (System.IO.IOException e)
            {
                if (keep == null)
                    keep = e;
            }
        }
        if (termVectorWriter != null)
        {
            try
            {
                termVectorWriter.Close();
            }
            catch (System.IO.IOException e)
            {
                if (keep == null)
                    keep = e;
            }
        }
        if (keep != null)
        {
            // rethrow the remembered exception itself; the old code threw a
            // brand-new IOException whose *message* was keep.StackTrace,
            // discarding the original exception's type and message
            throw keep;
        }
    }
}
// Writes one norm byte per indexed field (segment.fN files), combining the
// accumulated field boost with the Similarity length normalization.
private void WriteNorms(System.String segment)
{
    for (int n = 0; n < fieldInfos.Size(); n++)
    {
        FieldInfo fi = fieldInfos.FieldInfo(n);
        if (fi.isIndexed && !fi.omitNorms)
        {
            float norm = fieldBoosts[n] * similarity.LengthNorm(fi.name, fieldLengths[n]);
            IndexOutput norms = directory.CreateOutput(segment + ".f" + n);
            try
            {
                norms.WriteByte(Similarity.EncodeNorm(norm));
            }
            finally
            {
                // always close the norms file, even if the write fails
                norms.Close();
            }
        }
    }
}
/// <summary>If non-null, a message will be printed to this if maxFieldLength is reached.</summary>
internal void SetInfoStream(System.IO.TextWriter infoStream)
{
    this.infoStream = infoStream;
}
// info about a Term in a doc
sealed class Posting
{
    internal Term term;                      // the Term
    internal int freq;                       // its frequency in doc
    internal int[] positions;                // positions it occurs at
    internal TermVectorOffsetInfo[] offsets; // null when offsets are not stored

    internal Posting(Term t, int position, TermVectorOffsetInfo offset)
    {
        term = t;
        freq = 1;
        positions = new int[1];
        positions[0] = position;
        if (offset != null)
        {
            offsets = new TermVectorOffsetInfo[1];
            offsets[0] = offset;
        }
        else
        {
            offsets = null;
        }
    }
}