/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 using Directory
= Lucene
.Net
.Store
.Directory
;
18 using IndexOutput
= Lucene
.Net
.Store
.IndexOutput
;
19 using StringHelper
= Lucene
.Net
.Util
.StringHelper
;
namespace Lucene.Net.Index
{
    /// <summary>
    /// Writes the term vectors of a segment to three outputs: the document index
    /// (<c>.tvx</c>), the per-document field table (<c>.tvd</c>) and the per-field
    /// term data (<c>.tvf</c>).
    /// <para>
    /// The writer works by opening a document, then opening the fields within the
    /// document, and then writing out the vectors for each field. Rough usage:
    /// </para>
    /// <code>
    /// for each document
    /// {
    ///     writer.OpenDocument();
    ///     for each Field on the document
    ///     {
    ///         writer.OpenField(Field);
    ///         for all of the terms
    ///         {
    ///             writer.AddTerm(...)
    ///         }
    ///         writer.CloseField();
    ///     }
    ///     writer.CloseDocument();
    /// }
    /// </code>
    /// </summary>
    /// <version> $Id: TermVectorsWriter.cs,v 1.2 2005/10/06 19:29:56 dsd Exp $ </version>
    public sealed class TermVectorsWriter
    {
        /// <summary>Bit set in a field's flag byte when term positions are stored.</summary>
        internal const byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;

        /// <summary>Bit set in a field's flag byte when term offsets are stored.</summary>
        internal const byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

        /// <summary>Format version written as the first int of every output.</summary>
        internal const int FORMAT_VERSION = 2;

        /// <summary>The size in bytes that FORMAT_VERSION takes up at the beginning of each file.</summary>
        internal const int FORMAT_SIZE = 4;

        /// <summary>Extension of the document index output.</summary>
        internal const System.String TVX_EXTENSION = ".tvx";

        /// <summary>Extension of the per-document field table output.</summary>
        internal const System.String TVD_EXTENSION = ".tvd";

        /// <summary>Extension of the per-field term data output.</summary>
        internal const System.String TVF_EXTENSION = ".tvf";

        private IndexOutput tvx = null;
        private IndexOutput tvd = null;
        private IndexOutput tvf = null;

        // TVField entries of the document that is currently open.
        private System.Collections.ArrayList fields = null;

        // TVTerm entries of the field that is currently open.
        private System.Collections.ArrayList terms = null;

        private FieldInfos fieldInfos;

        // null while no field is open.
        private TVField currentField = null;

        // -1 while no document is open; otherwise the .tvd position where the document starts.
        private long currentDocPointer = -1;

        /// <summary>
        /// Creates the three term vector outputs for <paramref name="segment"/> in
        /// <paramref name="directory"/> and writes the format version to each.
        /// </summary>
        /// <param name="directory">Directory in which the outputs are created.</param>
        /// <param name="segment">Segment name; the output names are this name plus the extension.</param>
        /// <param name="fieldInfos">Field metadata used to resolve field names to numbers.</param>
        public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos)
        {
            try
            {
                // Open files for term vector storage; each one starts with the format version.
                tvx = directory.CreateOutput(segment + TVX_EXTENSION);
                tvx.WriteInt(FORMAT_VERSION);
                tvd = directory.CreateOutput(segment + TVD_EXTENSION);
                tvd.WriteInt(FORMAT_VERSION);
                tvf = directory.CreateOutput(segment + TVF_EXTENSION);
                tvf.WriteInt(FORMAT_VERSION);

                this.fieldInfos = fieldInfos;
                fields = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(fieldInfos.Size()));
                terms = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
            }
            catch
            {
                // Do not leak outputs that were already created. The original failure
                // is the one the caller must see, so secondary close errors are dropped.
                CloseOutputs();
                throw;
            }
        }

        /// <summary>
        /// Starts a new document. A document that is still open is closed (and
        /// written) first.
        /// </summary>
        public void OpenDocument()
        {
            CloseDocument();
            currentDocPointer = tvd.GetFilePointer();
        }

        /// <summary>
        /// Finishes the current document: closes the open field, if any, writes the
        /// document record and resets the per-document state. Does nothing when no
        /// document is open.
        /// </summary>
        public void CloseDocument()
        {
            if (!IsDocumentOpen())
            {
                return;
            }

            CloseField();
            WriteDoc();
            fields.Clear();
            currentDocPointer = -1;
        }

        /// <summary>Returns true if a document is currently open.</summary>
        public bool IsDocumentOpen()
        {
            return currentDocPointer != -1;
        }

        /// <summary>
        /// Starts processing a field. This can be followed by a number of calls to
        /// AddTerm, and a final call to CloseField to indicate the end of
        /// processing of this field. If a field was previously open, it is
        /// closed automatically.
        /// </summary>
        /// <param name="field">Name of the field, as known to the FieldInfos given to the constructor.</param>
        /// <exception cref="System.InvalidOperationException">No document is open.</exception>
        public void OpenField(System.String field)
        {
            FieldInfo fieldInfo = fieldInfos.FieldInfo(field);
            OpenField(fieldInfo.number, fieldInfo.storePositionWithTermVector, fieldInfo.storeOffsetWithTermVector);
        }

        /// <summary>
        /// Opens the field with the given number, closing a previously open field first.
        /// </summary>
        private void OpenField(int fieldNumber, bool storePositionWithTermVector, bool storeOffsetWithTermVector)
        {
            if (!IsDocumentOpen())
            {
                throw new System.InvalidOperationException("Cannot open field when no document is open.");
            }

            CloseField();
            currentField = new TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector);
        }

        /// <summary>
        /// Finishes processing the current field. This should be followed by a call to
        /// OpenField before future calls to AddTerm. Does nothing when no field is open.
        /// </summary>
        public void CloseField()
        {
            if (!IsFieldOpen())
            {
                return;
            }

            // save field and terms
            WriteField();
            fields.Add(currentField);
            terms.Clear();
            currentField = null;
        }

        /// <summary>Returns true if a field is currently open.</summary>
        public bool IsFieldOpen()
        {
            return currentField != null;
        }

        /// <summary>
        /// Adds a term to the open field's term vector without positions or offsets.
        /// Terms should be added in increasing order of terms, one call per unique term.
        /// </summary>
        /// <param name="termText">Text of the term.</param>
        /// <param name="freq">Number of times this term appears in this field, in this document.</param>
        /// <exception cref="System.InvalidOperationException">The document or the field is not open.</exception>
        public void AddTerm(System.String termText, int freq)
        {
            AddTerm(termText, freq, null, null);
        }

        /// <summary>
        /// Adds a term to the open field's term vector.
        /// Terms should be added in increasing order of terms, one call per unique term.
        /// </summary>
        /// <param name="termText">Text of the term.</param>
        /// <param name="freq">Number of times this term appears in this field, in this document.</param>
        /// <param name="positions">Term positions; must be non-null if the field stores positions.</param>
        /// <param name="offsets">Term offsets; must be non-null if the field stores offsets.</param>
        /// <exception cref="System.InvalidOperationException">The document or the field is not open.</exception>
        public void AddTerm(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
        {
            if (!IsDocumentOpen())
            {
                throw new System.InvalidOperationException("Cannot add terms when document is not open");
            }
            if (!IsFieldOpen())
            {
                throw new System.InvalidOperationException("Cannot add terms when field is not open");
            }

            AddTermInternal(termText, freq, positions, offsets);
        }

        /// <summary>Buffers a term for the open field; it is written by WriteField.</summary>
        private void AddTermInternal(System.String termText, int freq, int[] positions, TermVectorOffsetInfo[] offsets)
        {
            TVTerm term = new TVTerm();
            term.termText = termText;
            term.freq = freq;
            term.positions = positions;
            term.offsets = offsets;
            terms.Add(term);
        }

        /// <summary>
        /// Adds a complete document specified by all its term vectors. If the document
        /// has no term vectors, only the document record is written.
        /// </summary>
        /// <param name="vectors">Term vectors of the document, one per field; may be null.</param>
        /// <exception cref="System.IO.IOException">Writing to one of the outputs failed.</exception>
        public void AddAllDocVectors(TermFreqVector[] vectors)
        {
            OpenDocument();

            if (vectors != null)
            {
                for (int i = 0; i < vectors.Length; i++)
                {
                    AddVector(vectors[i]);
                }
            }

            CloseDocument();
        }

        /// <summary>
        /// Writes one field's vector. Positions and offsets are taken from the vector
        /// when it is a TermPositionVector; otherwise only terms and frequencies are stored.
        /// </summary>
        private void AddVector(TermFreqVector vector)
        {
            // A type test replaces catching InvalidCastException to tell the two vector kinds apart.
            TermPositionVector tpVector = vector as TermPositionVector;

            // The first term decides whether the field carries positions and/or offsets.
            bool hasTerms = tpVector != null && tpVector.Size() > 0;
            bool storePositionWithTermVector = hasTerms && tpVector.GetTermPositions(0) != null;
            bool storeOffsetWithTermVector = hasTerms && tpVector.GetOffsets(0) != null;

            FieldInfo fieldInfo = fieldInfos.FieldInfo(vector.GetField());
            OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

            System.String[] termTexts = vector.GetTerms();
            int[] termFreqs = vector.GetTermFrequencies();
            int count = vector.Size();
            for (int j = 0; j < count; j++)
            {
                if (tpVector != null)
                {
                    AddTermInternal(termTexts[j], termFreqs[j], tpVector.GetTermPositions(j), tpVector.GetOffsets(j));
                }
                else
                {
                    AddTermInternal(termTexts[j], termFreqs[j], null, null);
                }
            }

            CloseField();
        }

        /// <summary>
        /// Closes all streams. A document that is still open is written first.
        /// </summary>
        /// <exception cref="System.IO.IOException">
        /// Closing a stream failed; the first such failure is attached as the inner exception.
        /// </exception>
        public /*internal*/ void Close()
        {
            try
            {
                CloseDocument();
            }
            finally
            {
                // make an effort to close all streams we can but remember and re-throw
                // the first exception encountered in this process
                System.IO.IOException keep = CloseOutputs();
                if (keep != null)
                {
                    // Keep the original as inner exception so its message and stack trace survive.
                    throw new System.IO.IOException(keep.Message, keep);
                }
            }
        }

        /// <summary>
        /// Closes every output that was created and returns the first IOException
        /// raised while doing so, or null if all closes succeeded.
        /// </summary>
        private System.IO.IOException CloseOutputs()
        {
            System.IO.IOException first = null;
            first = CloseOutput(tvx, first);
            first = CloseOutput(tvd, first);
            first = CloseOutput(tvf, first);
            return first;
        }

        /// <summary>
        /// Closes one output if it exists; returns <paramref name="first"/> when it is
        /// already set, otherwise the IOException raised by this close (or null).
        /// </summary>
        private static System.IO.IOException CloseOutput(IndexOutput output, System.IO.IOException first)
        {
            if (output == null)
            {
                return first;
            }

            try
            {
                output.Close();
            }
            catch (System.IO.IOException e)
            {
                if (first == null)
                {
                    first = e;
                }
            }
            return first;
        }

        /// <summary>
        /// Writes the buffered terms of the current field to the .tvf output: term
        /// count, flag byte, then each term prefix-compressed against the previous
        /// one, followed by its delta-encoded positions and/or offsets.
        /// </summary>
        private void WriteField()
        {
            // remember where this field is written
            currentField.tvfPointer = tvf.GetFilePointer();

            int size = terms.Count;
            tvf.WriteVInt(size);

            bool storePositions = currentField.storePositions;
            bool storeOffsets = currentField.storeOffsets;
            byte bits = 0x0;
            if (storePositions)
            {
                bits |= STORE_POSITIONS_WITH_TERMVECTOR;
            }
            if (storeOffsets)
            {
                bits |= STORE_OFFSET_WITH_TERMVECTOR;
            }
            tvf.WriteByte(bits);

            System.String lastTermText = "";
            for (int i = 0; i < size; i++)
            {
                TVTerm term = (TVTerm) terms[i];
                int start = StringHelper.StringDifference(lastTermText, term.termText);
                int length = term.termText.Length - start;
                tvf.WriteVInt(start); // write shared prefix length
                tvf.WriteVInt(length); // write delta length
                tvf.WriteChars(term.termText, start, length); // write delta chars
                tvf.WriteVInt(term.freq);
                lastTermText = term.termText;

                if (storePositions)
                {
                    if (term.positions == null)
                    {
                        throw new System.InvalidOperationException("Trying to write positions that are null!");
                    }

                    // use delta encoding for positions
                    int lastPosition = 0;
                    for (int j = 0; j < term.freq; j++)
                    {
                        tvf.WriteVInt(term.positions[j] - lastPosition);
                        lastPosition = term.positions[j];
                    }
                }

                if (storeOffsets)
                {
                    if (term.offsets == null)
                    {
                        throw new System.InvalidOperationException("Trying to write offsets that are null!");
                    }

                    // use delta encoding for offsets
                    int lastEndOffset = 0;
                    for (int j = 0; j < term.freq; j++)
                    {
                        TermVectorOffsetInfo offset = term.offsets[j];
                        tvf.WriteVInt(offset.GetStartOffset() - lastEndOffset);
                        tvf.WriteVInt(offset.GetEndOffset() - offset.GetStartOffset()); // save the diff between the two
                        lastEndOffset = offset.GetEndOffset();
                    }
                }
            }
        }

        /// <summary>
        /// Writes the record of the current document: its .tvd pointer to the .tvx
        /// output, then the field count, field numbers and delta-encoded .tvf
        /// pointers to the .tvd output.
        /// </summary>
        private void WriteDoc()
        {
            if (IsFieldOpen())
            {
                throw new System.InvalidOperationException("Field is still open while writing document");
            }

            // write document index record
            tvx.WriteLong(currentDocPointer);

            // write document data record
            int size = fields.Count;

            // write the number of fields
            tvd.WriteVInt(size);

            // write field numbers
            for (int i = 0; i < size; i++)
            {
                TVField field = (TVField) fields[i];
                tvd.WriteVInt(field.number);
            }

            // write field pointers
            long lastFieldPointer = 0;
            for (int i = 0; i < size; i++)
            {
                TVField field = (TVField) fields[i];
                tvd.WriteVLong(field.tvfPointer - lastFieldPointer);
                lastFieldPointer = field.tvfPointer;
            }
        }

        /// <summary>Per-field bookkeeping for the document that is being written.</summary>
        private sealed class TVField
        {
            internal int number;
            internal long tvfPointer = 0;
            internal bool storePositions = false;
            internal bool storeOffsets = false;

            internal TVField(int number, bool storePos, bool storeOff)
            {
                this.number = number;
                storePositions = storePos;
                storeOffsets = storeOff;
            }
        }

        /// <summary>One buffered term of the field that is being written.</summary>
        private sealed class TVTerm
        {
            internal System.String termText;
            internal int freq = 0;
            internal int[] positions = null;
            internal TermVectorOffsetInfo[] offsets = null;
        }
    }
}