2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 using Directory
= Lucene
.Net
.Store
.Directory
;
19 using IndexOutput
= Lucene
.Net
.Store
.IndexOutput
;
20 using StringHelper
= Lucene
.Net
.Util
.StringHelper
;
22 namespace Lucene
.Net
.Index
25 /// <summary> Writer works by opening a document and then opening the fields within the document and then
26 /// writing out the vectors for each field.
33 /// writer.openDocument();
34 /// for each field on the document
36 /// writer.openField(field);
37 /// for all of the terms
39 /// writer.addTerm(...)
43 /// writer.closeDocument()
48 /// <version> $Id: TermVectorsWriter.cs,v 1.3 2006/10/02 17:09:00 joeshaw Exp $
51 public sealed class TermVectorsWriter
// Flag bits that WriteField ORs into the per-field `bits` byte.
53 internal const byte STORE_POSITIONS_WITH_TERMVECTOR
= (byte) (0x1);
54 internal const byte STORE_OFFSET_WITH_TERMVECTOR
= (byte) (0x2);
// Version number the constructor writes with WriteInt at the start of each of the three outputs.
56 internal const int FORMAT_VERSION
= 2;
57 //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
58 internal const int FORMAT_SIZE
= 4;
// Suffixes the constructor appends to the segment name when creating the three outputs.
60 internal const System
.String TVX_EXTENSION
= ".tvx";
61 internal const System
.String TVD_EXTENSION
= ".tvd";
62 internal const System
.String TVF_EXTENSION
= ".tvf";
// Outputs created by the constructor. WriteDoc writes currentDocPointer to tvx and the field
// numbers and tvf-pointer deltas to tvd; WriteField writes the term data to tvf.
64 private IndexOutput tvx
= null, tvd
= null, tvf
= null;
// TVField entries appended by CloseField and read back by WriteDoc.
65 private System
.Collections
.ArrayList fields
= null;
// TVTerm entries read by WriteField.
66 private System
.Collections
.ArrayList terms
= null;
// Source of the FieldInfo lookups done by OpenField and AddAllDocVectors.
67 private FieldInfos fieldInfos
;
// Field currently being built; null means no field is open (see IsFieldOpen).
69 private TVField currentField
= null;
// tvd file pointer captured by OpenDocument; -1 means no document is open (see IsDocumentOpen).
70 private long currentDocPointer
= - 1;
/// <summary>Gets the TVX_EXTENSION constant (".tvx").</summary>
72 public static System
.String TvxExtension
74 get { return TVX_EXTENSION; }
/// <summary>Gets the TVD_EXTENSION constant (".tvd").</summary>
76 public static System
.String TvdExtension
78 get { return TVD_EXTENSION; }
/// <summary>Gets the TVF_EXTENSION constant (".tvf").</summary>
80 public static System
.String TvfExtension
82 get { return TVF_EXTENSION; }
/// <summary>Creates the three term-vector outputs for a segment and writes FORMAT_VERSION to each.</summary>
/// <param name="directory">Directory whose CreateOutput is used to create the outputs.</param>
/// <param name="segment">Name prefix; each output is named segment + extension.</param>
/// <param name="fieldInfos">Kept for later field lookups; its Size() is the initial capacity of the fields list.</param>
85 public TermVectorsWriter(Directory directory
, System
.String segment
, FieldInfos fieldInfos
)
87 // Open files for TermVector storage
// NOTE(review): no cleanup is visible here if a later CreateOutput/WriteInt throws after an
// earlier output was already created — confirm whether callers handle that.
88 tvx
= directory
.CreateOutput(segment
+ TVX_EXTENSION
);
89 tvx
.WriteInt(FORMAT_VERSION
);
90 tvd
= directory
.CreateOutput(segment
+ TVD_EXTENSION
);
91 tvd
.WriteInt(FORMAT_VERSION
);
92 tvf
= directory
.CreateOutput(segment
+ TVF_EXTENSION
);
93 tvf
.WriteInt(FORMAT_VERSION
);
95 this.fieldInfos
= fieldInfos
;
// Both working lists are wrapped with ArrayList.Synchronized; `terms` starts with capacity 10.
96 fields
= System
.Collections
.ArrayList
.Synchronized(new System
.Collections
.ArrayList(fieldInfos
.Size()));
97 terms
= System
.Collections
.ArrayList
.Synchronized(new System
.Collections
.ArrayList(10));
/// <summary>Records the current tvd file pointer in currentDocPointer, which marks a document as open.</summary>
101 public void OpenDocument()
// NOTE(review): the embedded numbering jumps from 101 to 104, so part of this body is not
// shown in this listing — confirm against the full file.
104 currentDocPointer
= tvd
.GetFilePointer();
/// <summary>If a document is open, resets currentDocPointer to the -1 "no document" value.</summary>
108 public void CloseDocument()
110 if (IsDocumentOpen())
// NOTE(review): embedded lines 111-114 are not shown in this listing; only the pointer reset
// is visible — confirm what else runs before it.
115 currentDocPointer
= - 1;
/// <summary>Returns true while currentDocPointer differs from -1, the "no document open" value.</summary>
120 public bool IsDocumentOpen()
122 return currentDocPointer
!= - 1;
126 /// <summary>Start processing a field. This can be followed by a number of calls to
127 /// addTerm, and a final call to closeField to indicate the end of
128 /// processing of this field. If a field was previously open, it is
129 /// closed automatically.
131 public void OpenField(System
.String field
)
// Resolve the field name to its FieldInfo, then delegate to the private overload with that
// field's number and its position/offset storage flags.
// NOTE(review): no null check on the lookup result is visible; confirm FieldInfos.FieldInfo
// cannot return null for the names passed here.
133 FieldInfo fieldInfo
= fieldInfos
.FieldInfo(field
);
134 OpenField(fieldInfo
.number
, fieldInfo
.storePositionWithTermVector
, fieldInfo
.storeOffsetWithTermVector
);
// Makes a new TVField (field number plus the two storage flags) the current field.
// Throws System.SystemException when no document is open.
137 private void OpenField(int fieldNumber
, bool storePositionWithTermVector
, bool storeOffsetWithTermVector
)
139 if (!IsDocumentOpen())
140 throw new System
.SystemException("Cannot open field when no document is open.");
// NOTE(review): embedded line 141 is not shown in this listing, so any handling of a
// previously open field is not visible here — confirm against the full file.
142 currentField
= new TVField(fieldNumber
, storePositionWithTermVector
, storeOffsetWithTermVector
);
145 /// <summary>Finished processing current field. This should be followed by a call to
146 /// openField before future calls to addTerm.
148 public void CloseField()
// NOTE(review): several embedded lines of this body (149-152, 154-155, 157, 159 onward) are
// not shown in this listing — confirm the guard and the remaining steps against the full file.
153 //System.out.println("closeField()");
156 // save field and terms
// The only visible statement: append the current field to the `fields` list.
158 fields
.Add(currentField
);
164 /// <summary>Return true if a field is currently open. </summary>
165 public bool IsFieldOpen()
// "Open" is represented by currentField being non-null.
167 return currentField
!= null;
170 /// <summary>Add term to the field's term vector. Field must already be open.
171 /// Terms should be added in
172 /// increasing order of terms, one call per unique termNum. ProxPointer
173 /// is a pointer into the TermPosition file (prx). Freq is the number of
174 /// times this term appears in this field, in this document.
176 /// <throws> IllegalStateException if document or field is not open </throws>
177 public void AddTerm(System
.String termText
, int freq
)
// Delegates to the four-argument overload with null for both positions and offsets.
179 AddTerm(termText
, freq
, null, null);
/// <summary>Checks that a document is open and then passes the term data to AddTermInternal.</summary>
182 public void AddTerm(System
.String termText
, int freq
, int[] positions
, TermVectorOffsetInfo
[] offsets
)
184 if (!IsDocumentOpen())
185 throw new System
.SystemException("Cannot add terms when document is not open");
// NOTE(review): the guard for the next throw (embedded line 186) is not shown in this
// listing; as listed it reads as unconditional — confirm against the full file.
187 throw new System
.SystemException("Cannot add terms when field is not open");
189 AddTermInternal(termText
, freq
, positions
, offsets
);
// Builds a TVTerm from the arguments; performs no open-document or open-field checks itself.
// NOTE(review): embedded lines 196 and 199 are not shown in this listing, so the use of
// `freq` and the storing of the new term are not visible here — confirm against the full file.
192 private void AddTermInternal(System
.String termText
, int freq
, int[] positions
, TermVectorOffsetInfo
[] offsets
)
194 TVTerm term
= new TVTerm();
195 term
.termText
= termText
;
197 term
.positions
= positions
;
198 term
.offsets
= offsets
;
202 /// <summary> Add a complete document specified by all its term vectors. If document has no
203 /// term vectors, add value for tvx.
206 /// <param name="vectors">
208 /// <throws> IOException </throws>
209 public void AddAllDocVectors(TermFreqVector
[] vectors
)
// NOTE(review): embedded lines 210-214 (and the `try` that pairs with the catch below) are
// not shown in this listing — confirm the document open/close and null handling in the full file.
// Each vector is opened as a field and its terms are passed to AddTermInternal.
215 for (int i
= 0; i
< vectors
.Length
; i
++)
217 bool storePositionWithTermVector
= false;
218 bool storeOffsetWithTermVector
= false;
// Treat the vector as a TermPositionVector; the catch below handles InvalidCastException.
// NOTE(review): this uses an exception for type dispatch; an `is`/`as` test would avoid the throw.
223 TermPositionVector tpVector
= (TermPositionVector
) vectors
[i
];
// Each flag is set only when the vector is non-empty and its first term (index 0) has that data.
225 if (tpVector
.Size() > 0 && tpVector
.GetTermPositions(0) != null)
226 storePositionWithTermVector
= true;
227 if (tpVector
.Size() > 0 && tpVector
.GetOffsets(0) != null)
228 storeOffsetWithTermVector
= true;
230 FieldInfo fieldInfo
= fieldInfos
.FieldInfo(tpVector
.GetField());
231 OpenField(fieldInfo
.number
, storePositionWithTermVector
, storeOffsetWithTermVector
);
// GetTerms() and GetTermFrequencies() are called again on every iteration.
233 for (int j
= 0; j
< tpVector
.Size(); j
++)
234 AddTermInternal(tpVector
.GetTerms()[j
], tpVector
.GetTermFrequencies()[j
], tpVector
.GetTermPositions(j
), tpVector
.GetOffsets(j
));
// Fallback path: handle the vector as a plain TermFreqVector and add its terms with null
// positions and offsets. If the cast above is what threw, both flags are still false here.
238 catch (System
.InvalidCastException ignore
)
241 TermFreqVector tfVector
= vectors
[i
];
243 FieldInfo fieldInfo
= fieldInfos
.FieldInfo(tfVector
.GetField());
244 OpenField(fieldInfo
.number
, storePositionWithTermVector
, storeOffsetWithTermVector
);
246 for (int j
= 0; j
< tfVector
.Size(); j
++)
247 AddTermInternal(tfVector
.GetTerms()[j
], tfVector
.GetTermFrequencies()[j
], null, null);
257 /// <summary>Close all streams. </summary>
258 public /*internal*/ void Close()
// NOTE(review): the guarded statements and the catch bodies (embedded lines 259-265, 269-273,
// 275-283, 285-293, 295-300) are not shown in this listing — confirm against the full file.
266 // make an effort to close all streams we can but remember and re-throw
267 // the first exception encountered in this process
268 System
.IO
.IOException keep
= null;
// Three separate IOException handlers follow.
274 catch (System
.IO
.IOException e
)
284 catch (System
.IO
.IOException e
)
294 catch (System
.IO
.IOException e
)
// Throws a new IOException whose message is keep.StackTrace; the original exception object
// is not passed along as an inner exception, so its type details and message are not carried over.
301 throw new System
.IO
.IOException(keep
.StackTrace
);
// Writes the terms in `terms` to tvf for the current field, after recording in
// currentField.tvfPointer where in tvf this field starts.
308 private void WriteField()
310 // remember where this field is written
311 currentField
.tvfPointer
= tvf
.GetFilePointer();
312 //System.out.println("Field Pointer: " + currentField.tvfPointer);
314 int size
= terms
.Count
;
317 bool storePositions
= currentField
.storePositions
;
318 bool storeOffsets
= currentField
.storeOffsets
;
319 byte bits
= (byte) (0x0);
// NOTE(review): the conditions guarding the two flag updates below, and any write of `size`
// or `bits` to tvf, are on embedded lines not shown in this listing — confirm against the full file.
321 bits
|= STORE_POSITIONS_WITH_TERMVECTOR
;
323 bits
|= STORE_OFFSET_WITH_TERMVECTOR
;
// Each term is written relative to the previous term's text: prefix length, remaining length,
// the remaining chars, then freq.
326 System
.String lastTermText
= "";
327 for (int i
= 0; i
< size
; i
++)
329 TVTerm term
= (TVTerm
) terms
[i
];
330 int start
= StringHelper
.StringDifference(lastTermText
, term
.termText
);
331 int length
= term
.termText
.Length
- start
;
332 tvf
.WriteVInt(start
); // write shared prefix length
333 tvf
.WriteVInt(length
); // write delta length
334 tvf
.WriteChars(term
.termText
, start
, length
); // write delta chars
335 tvf
.WriteVInt(term
.freq
);
336 lastTermText
= term
.termText
;
// NOTE(review): the guard around the positions section and the declaration of `position`
// (embedded lines 337-339, 344) are not shown in this listing.
340 if (term
.positions
== null)
341 throw new System
.SystemException("Trying to write positions that are null!");
343 // use delta encoding for positions
// The loop bound is term.freq, so positions is assumed to hold at least freq entries — TODO confirm.
345 for (int j
= 0; j
< term
.freq
; j
++)
347 tvf
.WriteVInt(term
.positions
[j
] - position
);
348 position
= term
.positions
[j
];
// NOTE(review): the guard around the offsets section and the (re)declaration of `position`
// (embedded lines 351-353, 358) are not shown in this listing.
354 if (term
.offsets
== null)
355 throw new System
.SystemException("Trying to write offsets that are null!");
357 // use delta encoding for offsets
// For each offset: start is written relative to the previous entry's end offset, then the
// end is written as (end - start). The loop bound is again term.freq.
359 for (int j
= 0; j
< term
.freq
; j
++)
361 tvf
.WriteVInt(term
.offsets
[j
].GetStartOffset() - position
);
362 tvf
.WriteVInt(term
.offsets
[j
].GetEndOffset() - term
.offsets
[j
].GetStartOffset()); //Save the diff between the two.
363 position
= term
.offsets
[j
].GetEndOffset();
// Writes the current document's records: currentDocPointer goes to tvx; the field numbers and
// the tvf pointers of the collected fields go to tvd.
369 private void WriteDoc()
// NOTE(review): the guard for this throw (embedded lines 370-371) is not shown in this
// listing; as listed it reads as unconditional — confirm against the full file.
372 throw new System
.SystemException("Field is still open while writing document");
373 //System.out.println("Writing doc pointer: " + currentDocPointer);
374 // write document index record
375 tvx
.WriteLong(currentDocPointer
);
377 // write document data record
378 int size
= fields
.Count
;
380 // write the number of fields
// NOTE(review): the statement that writes the field count (embedded line 381) is not shown in this listing.
383 // write field numbers
384 for (int i
= 0; i
< size
; i
++)
386 TVField field
= (TVField
) fields
[i
];
387 tvd
.WriteVInt(field
.number
);
390 // write field pointers
// Each pointer is written as the difference from the previous field's tvfPointer (starting from 0).
391 long lastFieldPointer
= 0;
392 for (int i
= 0; i
< size
; i
++)
394 TVField field
= (TVField
) fields
[i
];
395 tvd
.WriteVLong(field
.tvfPointer
- lastFieldPointer
);
396 lastFieldPointer
= field
.tvfPointer
;
398 //System.out.println("After writing doc pointer: " + tvx.getFilePointer());
// Per-field record: the field number, where the field's data starts in tvf, and whether
// positions and/or offsets are stored for it.
402 private class TVField
// NOTE(review): the declaration of `number` (embedded line 404) is not shown in this listing.
// Set by WriteField to the tvf file pointer at the start of the field's data.
405 internal long tvfPointer
= 0;
406 internal bool storePositions
= false;
407 internal bool storeOffsets
= false;
408 internal TVField(int number
, bool storePos
, bool storeOff
)
410 this.number
= number
;
411 storePositions
= storePos
;
412 storeOffsets
= storeOff
;
// NOTE(review): the enclosing type's header is not shown in this listing; these members match
// the TVTerm fields that AddTermInternal assigns and WriteField reads — confirm against the full file.
418 internal System
.String termText
;
// Written after each term's text by WriteField and used there as the loop bound for positions/offsets.
419 internal int freq
= 0;
// Null unless supplied by the caller; WriteField throws if it needs one of these and it is null.
420 internal int[] positions
= null;
421 internal TermVectorOffsetInfo
[] offsets
= null;