cvsimport
[beagle.git] / beagled / Lucene.Net / Index / TermVectorsWriter.cs
blobe669e70bc6f57d2d384fa398ea9ec7272239653b
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 using System;
18 using Directory = Lucene.Net.Store.Directory;
19 using IndexOutput = Lucene.Net.Store.IndexOutput;
20 using StringHelper = Lucene.Net.Util.StringHelper;
22 namespace Lucene.Net.Index
/// <summary> The writer works by opening a document, then opening each field within that
/// document in turn, and writing out the term vector for each field.
///
/// Rough usage:
///
/// <CODE>
/// for each document
/// {
///     writer.OpenDocument();
///     for each field on the document
///     {
///         writer.OpenField(field);
///         for all of the terms
///         {
///             writer.AddTerm(...)
///         }
///         writer.CloseField()
///     }
///     writer.CloseDocument()
/// }
/// </CODE>
///
/// </summary>
/// <version> $Id: TermVectorsWriter.cs,v 1.3 2006/10/02 17:09:00 joeshaw Exp $
///
/// </version>
51 public sealed class TermVectorsWriter
// Flag bits combined into the single byte that WriteField emits per field.
internal const byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
internal const byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

// Version number written as the first int of every output created by the constructor.
internal const int FORMAT_VERSION = 2;
// The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
internal const int FORMAT_SIZE = 4;

// Suffixes appended to the segment name to form the three output names.
internal const string TVX_EXTENSION = ".tvx";
internal const string TVD_EXTENSION = ".tvd";
internal const string TVF_EXTENSION = ".tvf";

// The three outputs; assigned by the constructor.
private IndexOutput tvx;
private IndexOutput tvd;
private IndexOutput tvf;

// Fields already written for the open document, and terms buffered for the open field.
private System.Collections.ArrayList fields;
private System.Collections.ArrayList terms;
private FieldInfos fieldInfos;

// null while no field is open.
private TVField currentField;
// -1 while no document is open; otherwise the tvd position recorded by OpenDocument.
private long currentDocPointer = -1L;
/// <summary>Gets the suffix (".tvx") that is appended to the segment name for the tvx output.</summary>
public static string TvxExtension
{
    get
    {
        return TermVectorsWriter.TVX_EXTENSION;
    }
}
/// <summary>Gets the suffix (".tvd") that is appended to the segment name for the tvd output.</summary>
public static string TvdExtension
{
    get
    {
        return TermVectorsWriter.TVD_EXTENSION;
    }
}
/// <summary>Gets the suffix (".tvf") that is appended to the segment name for the tvf output.</summary>
public static string TvfExtension
{
    get
    {
        return TermVectorsWriter.TVF_EXTENSION;
    }
}
/// <summary>Creates the tvx, tvd and tvf outputs for a segment and writes
/// FORMAT_VERSION as the first int of each.
/// </summary>
/// <param name="directory">Directory in which the three outputs are created.</param>
/// <param name="segment">Segment name; each extension constant is appended to it.</param>
/// <param name="fieldInfos">Used to resolve field names to field numbers and to
/// pre-size the per-document field list.</param>
public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos)
{
    try
    {
        // Open files for TermVector storage
        tvx = directory.CreateOutput(segment + TVX_EXTENSION);
        tvx.WriteInt(FORMAT_VERSION);
        tvd = directory.CreateOutput(segment + TVD_EXTENSION);
        tvd.WriteInt(FORMAT_VERSION);
        tvf = directory.CreateOutput(segment + TVF_EXTENSION);
        tvf.WriteInt(FORMAT_VERSION);

        this.fieldInfos = fieldInfos;
        fields = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(fieldInfos.Size()));
        terms = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
    }
    catch
    {
        // Construction failed part-way through. The caller never receives an instance
        // on which to call Close(), so release whatever outputs were already opened.
        // No document is open at this point, so Close() only closes the non-null outputs.
        try
        {
            Close();
        }
        catch (System.IO.IOException)
        {
            // Deliberately dropped: the failure that aborted construction is the one
            // that must reach the caller.
        }
        throw;
    }
}
/// <summary>Begins a new document. A document that is still open is closed first;
/// the current position of the tvd output is then remembered as this document's pointer.
/// </summary>
public void OpenDocument()
{
    this.CloseDocument();

    long documentStart = this.tvd.GetFilePointer();
    this.currentDocPointer = documentStart;
}
/// <summary>Finishes the open document, if any: closes the open field, writes the
/// document record, and resets the per-document state. Does nothing when no document
/// is open.
/// </summary>
public void CloseDocument()
{
    if (!IsDocumentOpen())
    {
        return;
    }

    CloseField();
    WriteDoc();

    // Reset per-document state so the next OpenDocument starts clean.
    fields.Clear();
    currentDocPointer = -1;
}
/// <summary>Returns true while a document is open, i.e. while the current document
/// pointer differs from the -1 sentinel.
/// </summary>
public bool IsDocumentOpen()
{
    const long NoDocument = -1;
    bool closed = (currentDocPointer == NoDocument);
    return !closed;
}
/// <summary>Starts processing the named field. Follow with any number of AddTerm
/// calls and a final CloseField call to mark the end of the field. A field that is
/// already open is closed automatically.
/// </summary>
/// <param name="field">Name of the field, looked up in the FieldInfos given to the constructor.</param>
public void OpenField(System.String field)
{
    FieldInfo info = fieldInfos.FieldInfo(field);

    int number = info.number;
    bool withPositions = info.storePositionWithTermVector;
    bool withOffsets = info.storeOffsetWithTermVector;

    OpenField(number, withPositions, withOffsets);
}
/// <summary>Opens a field by number, closing any field that is still open first.</summary>
/// <param name="fieldNumber">Number recorded for the field in the document record.</param>
/// <param name="storePositionWithTermVector">Whether positions are written for each term of this field.</param>
/// <param name="storeOffsetWithTermVector">Whether offsets are written for each term of this field.</param>
/// <exception cref="System.InvalidOperationException">No document is open.</exception>
private void OpenField(int fieldNumber, bool storePositionWithTermVector, bool storeOffsetWithTermVector)
{
    if (!IsDocumentOpen())
    {
        // InvalidOperationException derives from SystemException, so callers that
        // catch the previously thrown SystemException keep working.
        throw new System.InvalidOperationException("Cannot open field when no document is open.");
    }

    CloseField();
    currentField = new TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector);
}
/// <summary>Finishes the current field. Call OpenField again before adding further
/// terms. Does nothing when no field is open.
/// </summary>
public void CloseField()
{
    if (!IsFieldOpen())
    {
        return;
    }

    // Write the buffered terms, then remember the field for the document record.
    WriteField();
    fields.Add(currentField);

    terms.Clear();
    currentField = null;
}
/// <summary>Tells whether a field is currently open.</summary>
/// <returns>true when a current field is set, false otherwise.</returns>
public bool IsFieldOpen()
{
    bool noField = (currentField == null);
    return !noField;
}
/// <summary>Adds a term, without positions or offsets, to the open field's term
/// vector. Terms should be added in increasing order, one call per unique term.
/// </summary>
/// <param name="termText">Text of the term.</param>
/// <param name="freq">Number of times the term appears in this field of this document.</param>
/// <throws> IllegalStateException if document or field is not open </throws>
public void AddTerm(System.String termText, int freq)
{
    int[] noPositions = null;
    TermVectorOffsetInfo[] noOffsets = null;
    AddTerm(termText, freq, noPositions, noOffsets);
}
/// <summary>Adds a term, with optional positions and offsets, to the open field's
/// term vector. Both a document and a field must be open.
/// </summary>
/// <param name="termText">Text of the term.</param>
/// <param name="freq">Frequency of the term; WriteField uses it as the number of
/// position and offset entries to write.</param>
/// <param name="positions">Term positions, or null.</param>
/// <param name="offsets">Term offsets, or null.</param>
/// <exception cref="System.InvalidOperationException">No document or no field is open.</exception>
public void AddTerm(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
{
    // InvalidOperationException is the .NET counterpart of the documented
    // IllegalStateException; it derives from SystemException, so callers that
    // catch the previously thrown SystemException keep working.
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when document is not open");
    }
    if (!IsFieldOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when field is not open");
    }

    AddTermInternal(termText, freq, positions, offsets);
}
/// <summary>Buffers one term for the open field without checking that a document or
/// field is open. The term is written out later by WriteField.
/// </summary>
private void AddTermInternal(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
{
    TVTerm entry = new TVTerm();
    entry.offsets = offsets;
    entry.positions = positions;
    entry.freq = freq;
    entry.termText = termText;

    terms.Add(entry);
}
/// <summary> Adds a complete document specified by all its term vectors. If the
/// document has no term vectors (<paramref name="vectors"/> is null), an empty
/// document record is still written.
/// </summary>
/// <param name="vectors">One term vector per field, or null. Elements that also
/// implement TermPositionVector contribute positions and offsets.</param>
/// <throws> IOException </throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    OpenDocument();

    if (vectors != null)
    {
        for (int i = 0; i < vectors.Length; i++)
        {
            // A type test replaces the former cast-and-catch: `as` yields null for a
            // plain frequency vector instead of raising InvalidCastException, so an
            // unrelated InvalidCastException can no longer be mistaken for
            // "no positions" and cause the field to be written a second time.
            TermPositionVector tpVector = vectors[i] as TermPositionVector;

            if (tpVector != null)
            {
                // Positions/offsets are stored only if the first term carries them.
                bool storePositionWithTermVector = tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null;
                bool storeOffsetWithTermVector = tpVector.Size() > 0 && tpVector.GetOffsets(0) != null;

                FieldInfo fieldInfo = fieldInfos.FieldInfo(tpVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

                for (int j = 0; j < tpVector.Size(); j++)
                {
                    AddTermInternal(tpVector.GetTerms()[j], tpVector.GetTermFrequencies()[j], tpVector.GetTermPositions(j), tpVector.GetOffsets(j));
                }

                CloseField();
            }
            else
            {
                TermFreqVector tfVector = vectors[i];

                FieldInfo fieldInfo = fieldInfos.FieldInfo(tfVector.GetField());
                OpenField(fieldInfo.number, false, false);

                for (int j = 0; j < tfVector.Size(); j++)
                {
                    AddTermInternal(tfVector.GetTerms()[j], tfVector.GetTermFrequencies()[j], null, null);
                }

                CloseField();
            }
        }
    }

    CloseDocument();
}
/// <summary>Closes the open document, if any, and then all three outputs.</summary>
/// <exception cref="System.IO.IOException">At least one output failed to close. The
/// first such failure is available as the InnerException.</exception>
public /*internal*/ void Close()
{
    try
    {
        CloseDocument();
    }
    finally
    {
        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        System.IO.IOException keep = null;
        keep = CloseOutput(tvx, keep);
        keep = CloseOutput(tvd, keep);
        keep = CloseOutput(tvf, keep);

        if (keep != null)
        {
            // Keep the original message and chain the original exception, rather
            // than using its stack trace text as the message.
            throw new System.IO.IOException(keep.Message, keep);
        }
    }
}

/// <summary>Closes one output if it is non-null and returns the first IOException seen so far.</summary>
private static System.IO.IOException CloseOutput(IndexOutput output, System.IO.IOException firstError)
{
    if (output == null)
    {
        return firstError;
    }

    try
    {
        output.Close();
    }
    catch (System.IO.IOException e)
    {
        if (firstError == null)
        {
            firstError = e;
        }
    }

    return firstError;
}
/// <summary>Writes the buffered terms of the current field to the tvf output and
/// records in the current field where that data starts.
/// </summary>
/// <exception cref="System.InvalidOperationException">The field stores positions or
/// offsets but a buffered term has none.</exception>
private void WriteField()
{
    // remember where this field is written
    currentField.tvfPointer = tvf.GetFilePointer();

    int size = terms.Count;
    tvf.WriteVInt(size);

    // One flag byte per field says which optional per-term data follows.
    bool storePositions = currentField.storePositions;
    bool storeOffsets = currentField.storeOffsets;
    byte bits = (byte) (0x0);
    if (storePositions)
    {
        bits |= STORE_POSITIONS_WITH_TERMVECTOR;
    }
    if (storeOffsets)
    {
        bits |= STORE_OFFSET_WITH_TERMVECTOR;
    }
    tvf.WriteByte(bits);

    System.String lastTermText = "";
    for (int i = 0; i < size; i++)
    {
        TVTerm term = (TVTerm) terms[i];

        // Only the part of the term text after the prefix shared with the previous term is written.
        int start = StringHelper.StringDifference(lastTermText, term.termText);
        int length = term.termText.Length - start;
        tvf.WriteVInt(start); // write shared prefix length
        tvf.WriteVInt(length); // write delta length
        tvf.WriteChars(term.termText, start, length); // write delta chars
        tvf.WriteVInt(term.freq);
        lastTermText = term.termText;

        if (storePositions)
        {
            if (term.positions == null)
            {
                // InvalidOperationException derives from SystemException, so callers
                // that catch the previously thrown SystemException keep working.
                throw new System.InvalidOperationException("Trying to write positions that are null!");
            }

            // use delta encoding for positions
            int position = 0;
            for (int j = 0; j < term.freq; j++)
            {
                tvf.WriteVInt(term.positions[j] - position);
                position = term.positions[j];
            }
        }

        if (storeOffsets)
        {
            if (term.offsets == null)
            {
                throw new System.InvalidOperationException("Trying to write offsets that are null!");
            }

            // use delta encoding for offsets
            int position = 0;
            for (int j = 0; j < term.freq; j++)
            {
                tvf.WriteVInt(term.offsets[j].GetStartOffset() - position);
                tvf.WriteVInt(term.offsets[j].GetEndOffset() - term.offsets[j].GetStartOffset()); //Save the diff between the two.
                position = term.offsets[j].GetEndOffset();
            }
        }
    }
}
/// <summary>Writes the record for the current document: its pointer goes to the tvx
/// output; the field count, all field numbers, and then all delta-encoded tvf
/// pointers go to the tvd output.
/// </summary>
/// <exception cref="System.InvalidOperationException">A field is still open.</exception>
private void WriteDoc()
{
    if (IsFieldOpen())
    {
        // InvalidOperationException derives from SystemException, so callers that
        // catch the previously thrown SystemException keep working.
        throw new System.InvalidOperationException("Field is still open while writing document");
    }

    // write document index record
    tvx.WriteLong(currentDocPointer);

    // write document data record
    int size = fields.Count;

    // write the number of fields
    tvd.WriteVInt(size);

    // write field numbers (all numbers come before any pointer, so the two loops stay separate)
    for (int i = 0; i < size; i++)
    {
        TVField field = (TVField) fields[i];
        tvd.WriteVInt(field.number);
    }

    // write field pointers, each as the difference from the previous one
    long lastFieldPointer = 0;
    for (int i = 0; i < size; i++)
    {
        TVField field = (TVField) fields[i];
        tvd.WriteVLong(field.tvfPointer - lastFieldPointer);
        lastFieldPointer = field.tvfPointer;
    }
}
/// <summary>Per-field bookkeeping: the field number, where the field's data starts
/// in the tvf output, and which optional per-term data the field stores.
/// </summary>
private class TVField
{
    internal int number;
    // Set by WriteField; stays 0 until then.
    internal long tvfPointer;
    internal bool storePositions;
    internal bool storeOffsets;

    internal TVField(int number, bool storePos, bool storeOff)
    {
        this.storeOffsets = storeOff;
        this.storePositions = storePos;
        this.number = number;
    }
}
/// <summary>One buffered term of the open field: its text, its frequency, and the
/// optional positions and offsets (null when not supplied).
/// </summary>
private class TVTerm
{
    internal string termText;
    internal int freq;
    internal int[] positions;
    internal TermVectorOffsetInfo[] offsets;
}