Update the thread-local storage patch, to fix #335178
[beagle.git] / beagled / Lucene.Net / Index / TermVectorsWriter.cs
blobf67fcef3cef83e470bf59f65a36f61d5995a79c6
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 using System;
17 using Directory = Lucene.Net.Store.Directory;
18 using IndexOutput = Lucene.Net.Store.IndexOutput;
19 using StringHelper = Lucene.Net.Util.StringHelper;
20 namespace Lucene.Net.Index
23 /// <summary> Writer works by opening a document and then opening the fields within the document and then
24 /// writing out the vectors for each Field.
25 ///
26 /// Rough usage:
27 ///
28 /// <CODE>
29 /// for each document
30 /// {
31 /// writer.openDocument();
32 /// for each Field on the document
33 /// {
34 /// writer.openField(Field);
35 /// for all of the terms
36 /// {
37 /// writer.addTerm(...)
38 /// }
39 /// writer.closeField
40 /// }
41 /// writer.closeDocument()
42 /// }
43 /// </CODE>
44 ///
45 /// </summary>
46 /// <version> $Id: TermVectorsWriter.cs,v 1.2 2005/10/06 19:29:56 dsd Exp $
47 ///
48 /// </version>
49 sealed public class TermVectorsWriter
// Flag bits combined into the per-field byte written by WriteField.
internal const byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
internal const byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

// Version number written as an int at the start of each of the three files.
internal const int FORMAT_VERSION = 2;
// The size in bytes that the FORMAT_VERSION will take up at the beginning of each file.
internal const int FORMAT_SIZE = 4;

// File name suffixes appended to the segment name.
internal const string TVX_EXTENSION = ".tvx";
internal const string TVD_EXTENSION = ".tvd";
internal const string TVF_EXTENSION = ".tvf";

// Output streams for the index (.tvx), document (.tvd) and field (.tvf) files.
private IndexOutput tvx = null;
private IndexOutput tvd = null;
private IndexOutput tvf = null;

// TVField entries already written for the document that is currently open.
private System.Collections.ArrayList fields = null;
// TVTerm entries buffered for the field that is currently open.
private System.Collections.ArrayList terms = null;
private FieldInfos fieldInfos;

// The field being collected, or null when no field is open.
private TVField currentField = null;
// Position in the .tvd file where the open document starts; -1 means no document is open.
private long currentDocPointer = -1;
/// <summary>Creates the three term vector files (.tvx, .tvd, .tvf) for the given
/// segment and writes the format version at the start of each of them.
/// If anything fails part-way through, the outputs that were already opened
/// are closed again before the exception propagates, so no stream is leaked.
/// </summary>
/// <param name="directory">Directory in which the files are created.</param>
/// <param name="segment">Segment name used as the file name prefix.</param>
/// <param name="fieldInfos">Field metadata used to resolve field names to numbers.</param>
public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos)
{
    bool success = false;
    try
    {
        // Open files for TermVector storage
        tvx = directory.CreateOutput(segment + TVX_EXTENSION);
        tvx.WriteInt(FORMAT_VERSION);
        tvd = directory.CreateOutput(segment + TVD_EXTENSION);
        tvd.WriteInt(FORMAT_VERSION);
        tvf = directory.CreateOutput(segment + TVF_EXTENSION);
        tvf.WriteInt(FORMAT_VERSION);

        this.fieldInfos = fieldInfos;
        fields = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(fieldInfos.Size()));
        terms = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        success = true;
    }
    finally
    {
        if (!success)
        {
            // Construction failed: the caller never receives an instance it could
            // Close(), so release whatever was opened here.
            CloseQuietly(tvx);
            CloseQuietly(tvd);
            CloseQuietly(tvf);
        }
    }
}

/// <summary>Best-effort close used only on the constructor's failure path; an
/// IOException raised while closing is dropped so that the original failure
/// is the one that reaches the caller.
/// </summary>
private static void CloseQuietly(IndexOutput output)
{
    if (output == null)
    {
        return;
    }
    try
    {
        output.Close();
    }
    catch (System.IO.IOException)
    {
        // Deliberately ignored: the constructor's own exception is already propagating.
    }
}
/// <summary>Starts a new document. A document that is still open is closed
/// first; the current position in the .tvd file is then remembered as the
/// start of the new document.
/// </summary>
public void OpenDocument()
{
    this.CloseDocument();
    long documentStart = tvd.GetFilePointer();
    this.currentDocPointer = documentStart;
}
/// <summary>Finishes the current document: closes any open field, writes the
/// document record, forgets the collected fields and marks the writer as
/// having no open document. Does nothing when no document is open.
/// </summary>
public void CloseDocument()
{
    if (!IsDocumentOpen())
    {
        return;
    }

    CloseField();
    WriteDoc();
    fields.Clear();
    currentDocPointer = -1;
}
/// <summary>Return true if a document is currently open, i.e. a document
/// start pointer other than -1 has been recorded.
/// </summary>
public bool IsDocumentOpen()
{
    bool noDocument = (currentDocPointer == -1);
    return !noDocument;
}
/// <summary>Begins a new Field, looked up by name in the field infos. Follow
/// with any number of AddTerm calls and finish with CloseField. A Field that
/// is still open when this is called is closed automatically.
/// </summary>
/// <param name="field">Name of the field to open.</param>
public void OpenField(System.String field)
{
    FieldInfo info = fieldInfos.FieldInfo(field);
    int number = info.number;
    bool withPositions = info.storePositionWithTermVector;
    bool withOffsets = info.storeOffsetWithTermVector;
    OpenField(number, withPositions, withOffsets);
}
/// <summary>Opens a field by number with explicit position/offset storage
/// flags. Any field that is still open is closed (and written) first.
/// </summary>
/// <param name="fieldNumber">Number of the field being opened.</param>
/// <param name="storePositionWithTermVector">Whether term positions are written for this field.</param>
/// <param name="storeOffsetWithTermVector">Whether term offsets are written for this field.</param>
/// <exception cref="System.InvalidOperationException">No document is open.</exception>
private void OpenField(int fieldNumber, bool storePositionWithTermVector, bool storeOffsetWithTermVector)
{
    if (!IsDocumentOpen())
    {
        // InvalidOperationException derives from SystemException, so callers that
        // catch the previously thrown SystemException keep working.
        throw new System.InvalidOperationException("Cannot open field when no document is open.");
    }
    CloseField();
    currentField = new TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector);
}
/// <summary>Ends the current Field: its buffered terms are written out, the
/// Field is recorded for the open document and the term buffer is emptied.
/// Call OpenField again before adding further terms. Does nothing when no
/// Field is open.
/// </summary>
public void CloseField()
{
    if (!IsFieldOpen())
    {
        return;
    }

    // save Field and terms
    WriteField();
    fields.Add(currentField);
    terms.Clear();
    currentField = null;
}
/// <summary>Tells whether a Field is open at the moment. </summary>
public bool IsFieldOpen()
{
    if (currentField == null)
    {
        return false;
    }
    return true;
}
/// <summary>Adds a term, without positions or offsets, to the term vector of
/// the open Field. Terms should be added in increasing order of terms, one
/// call per unique term. Freq is the number of times the term appears in
/// this field, in this document.
/// </summary>
/// <param name="termText">Text of the term.</param>
/// <param name="freq">Frequency of the term within the field of this document.</param>
/// <throws> IllegalStateException if document or field is not open </throws>
public void AddTerm(System.String termText, int freq)
{
    int[] noPositions = null;
    TermVectorOffsetInfo[] noOffsets = null;
    AddTerm(termText, freq, noPositions, noOffsets);
}
/// <summary>Adds a term, optionally with its positions and offsets, to the
/// term vector of the currently open field.
/// </summary>
/// <param name="termText">Text of the term.</param>
/// <param name="freq">Frequency of the term within the field of this document.</param>
/// <param name="positions">Term positions, or null when none are supplied.</param>
/// <param name="offsets">Term offsets, or null when none are supplied.</param>
/// <exception cref="System.InvalidOperationException">No document or no field is open.</exception>
public void AddTerm(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
{
    // InvalidOperationException is a SystemException, so code that caught the
    // exception type thrown previously is unaffected.
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when document is not open");
    }
    if (!IsFieldOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when field is not open");
    }

    AddTermInternal(termText, freq, positions, offsets);
}
/// <summary>Buffers one term for the open field without checking writer
/// state; the buffered terms are written when the field is closed.
/// </summary>
private void AddTermInternal(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
{
    TVTerm entry = new TVTerm();
    entry.termText = termText;
    entry.freq = freq;
    entry.positions = positions;
    entry.offsets = offsets;

    terms.Add(entry);
}
/// <summary> Add a complete document specified by all its term vectors. If document has no
/// term vectors, add value for tvx.
///
/// Vectors that implement TermPositionVector are written with positions
/// and/or offsets when their first term carries them; every other vector is
/// written with terms and frequencies only.
/// </summary>
/// <param name="vectors">Term vectors of the document; may be null.</param>
/// <throws> IOException </throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    OpenDocument();

    if (vectors != null)
    {
        for (int i = 0; i < vectors.Length; i++)
        {
            TermFreqVector tfVector = vectors[i];

            // A type test instead of a caught InvalidCastException: the catch
            // used before would also have swallowed an InvalidCastException
            // raised while the vector was being written.
            TermPositionVector tpVector = tfVector as TermPositionVector;

            if (tpVector != null)
            {
                bool storePositionWithTermVector = false;
                bool storeOffsetWithTermVector = false;

                if (tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null)
                {
                    storePositionWithTermVector = true;
                }
                if (tpVector.Size() > 0 && tpVector.GetOffsets(0) != null)
                {
                    storeOffsetWithTermVector = true;
                }

                FieldInfo fieldInfo = fieldInfos.FieldInfo(tpVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

                for (int j = 0; j < tpVector.Size(); j++)
                {
                    AddTermInternal(tpVector.GetTerms()[j], tpVector.GetTermFrequencies()[j], tpVector.GetTermPositions(j), tpVector.GetOffsets(j));
                }

                CloseField();
            }
            else
            {
                // Plain frequency vector: neither positions nor offsets are stored.
                FieldInfo fieldInfo = fieldInfos.FieldInfo(tfVector.GetField());
                OpenField(fieldInfo.number, false, false);

                for (int j = 0; j < tfVector.Size(); j++)
                {
                    AddTermInternal(tfVector.GetTerms()[j], tfVector.GetTermFrequencies()[j], null, null);
                }

                CloseField();
            }
        }
    }

    CloseDocument();
}
/// <summary>Close all streams. Any open document is closed (and written)
/// first. All three streams are closed even if one of them fails; the first
/// IOException encountered while closing is then re-thrown, wrapped so that
/// its message is kept and the original exception is available as
/// InnerException.
/// </summary>
/// <exception cref="System.IO.IOException">Closing at least one stream failed.</exception>
public /*internal*/ void Close()
{
    try
    {
        CloseDocument();
    }
    finally
    {
        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        System.IO.IOException keep = null;
        keep = CloseKeepingFirstError(tvx, keep);
        keep = CloseKeepingFirstError(tvd, keep);
        keep = CloseKeepingFirstError(tvf, keep);

        if (keep != null)
        {
            // Wrap instead of using the stack trace text as the message, so the
            // original message and stack trace both stay available to the caller.
            throw new System.IO.IOException(keep.Message, keep);
        }
    }
}

/// <summary>Closes one stream (null streams are skipped) and returns the
/// first IOException seen so far: the one passed in if there is one,
/// otherwise the one raised by this close, otherwise null.
/// </summary>
private static System.IO.IOException CloseKeepingFirstError(IndexOutput stream, System.IO.IOException first)
{
    if (stream == null)
    {
        return first;
    }
    try
    {
        stream.Close();
    }
    catch (System.IO.IOException e)
    {
        if (first == null)
        {
            first = e;
        }
    }
    return first;
}
/// <summary>Writes the buffered terms of the current field to the .tvf file
/// and records in the field where its data starts.
///
/// Layout written: term count (VInt), flag byte, then per term the shared
/// prefix length with the previous term, the remaining length, the remaining
/// characters and the frequency, followed by delta-encoded positions and/or
/// offsets when the field stores them.
/// </summary>
/// <exception cref="System.InvalidOperationException">The field stores positions
/// or offsets but a buffered term has none.</exception>
private void WriteField()
{
    // remember where this field is written
    currentField.tvfPointer = tvf.GetFilePointer();

    int size = terms.Count;
    tvf.WriteVInt(size);

    bool storePositions = currentField.storePositions;
    bool storeOffsets = currentField.storeOffsets;
    byte bits = (byte) (0x0);
    if (storePositions)
    {
        bits |= STORE_POSITIONS_WITH_TERMVECTOR;
    }
    if (storeOffsets)
    {
        bits |= STORE_OFFSET_WITH_TERMVECTOR;
    }
    tvf.WriteByte(bits);

    System.String lastTermText = "";
    for (int i = 0; i < size; i++)
    {
        TVTerm term = (TVTerm) terms[i];
        int start = StringHelper.StringDifference(lastTermText, term.termText);
        int length = term.termText.Length - start;
        tvf.WriteVInt(start); // write shared prefix length
        tvf.WriteVInt(length); // write delta length
        tvf.WriteChars(term.termText, start, length); // write delta chars
        tvf.WriteVInt(term.freq);
        lastTermText = term.termText;

        if (storePositions)
        {
            if (term.positions == null)
            {
                // InvalidOperationException is a SystemException, so existing
                // catch blocks for the previously thrown type still apply.
                throw new System.InvalidOperationException("Trying to write positions that are null!");
            }

            // use delta encoding for positions
            int position = 0;
            for (int j = 0; j < term.freq; j++)
            {
                tvf.WriteVInt(term.positions[j] - position);
                position = term.positions[j];
            }
        }

        if (storeOffsets)
        {
            if (term.offsets == null)
            {
                throw new System.InvalidOperationException("Trying to write offsets that are null!");
            }

            // use delta encoding for offsets: each start is stored relative to
            // the previous end, each end as its distance from its own start
            int position = 0;
            for (int j = 0; j < term.freq; j++)
            {
                tvf.WriteVInt(term.offsets[j].GetStartOffset() - position);
                tvf.WriteVInt(term.offsets[j].GetEndOffset() - term.offsets[j].GetStartOffset()); //Save the diff between the two.
                position = term.offsets[j].GetEndOffset();
            }
        }
    }
}
/// <summary>Writes the record for the current document: the document's start
/// pointer goes to the .tvx file; the .tvd file receives the number of
/// fields, then every field number, then every field's .tvf pointer stored
/// as the difference from the previous field's pointer.
/// </summary>
/// <exception cref="System.InvalidOperationException">A field is still open.</exception>
private void WriteDoc()
{
    if (IsFieldOpen())
    {
        // InvalidOperationException is a SystemException, so existing catch
        // blocks for the previously thrown type still apply.
        throw new System.InvalidOperationException("Field is still open while writing document");
    }

    // write document index record
    tvx.WriteLong(currentDocPointer);

    // write document data record
    int size = fields.Count;

    // write the number of fields
    tvd.WriteVInt(size);

    // write field numbers
    for (int i = 0; i < size; i++)
    {
        TVField field = (TVField) fields[i];
        tvd.WriteVInt(field.number);
    }

    // write field pointers
    long lastFieldPointer = 0;
    for (int i = 0; i < size; i++)
    {
        TVField field = (TVField) fields[i];
        tvd.WriteVLong(field.tvfPointer - lastFieldPointer);
        lastFieldPointer = field.tvfPointer;
    }
}
/// <summary>Per-field bookkeeping for the open document: the field number,
/// where the field's data starts in the .tvf file, and whether positions
/// and offsets are stored for it.
/// </summary>
private class TVField
{
    internal int number;
    // Set by WriteField when the field is written; zero until then.
    internal long tvfPointer;
    internal bool storePositions;
    internal bool storeOffsets;

    internal TVField(int number, bool storePos, bool storeOff)
    {
        this.number = number;
        this.storePositions = storePos;
        this.storeOffsets = storeOff;
    }
}
/// <summary>One buffered term of the open field: its text and frequency,
/// plus optional positions and offsets (null when not supplied).
/// </summary>
private class TVTerm
{
    internal System.String termText;
    internal int freq;
    internal int[] positions;
    internal TermVectorOffsetInfo[] offsets;
}