2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 using Directory
= Lucene
.Net
.Store
.Directory
;
18 using OutputStream
= Lucene
.Net
.Store
.OutputStream
;
19 using StringHelper
= Lucene
.Net
.Util
.StringHelper
;
20 namespace Lucene
.Net
.Index
/// <summary> Writer works by opening a document, then opening each Field within that document, and then
/// writing out the term vector for each Field.
///
/// Rough usage:
/// <code>
/// for each document
/// {
///     writer.OpenDocument();
///     for each Field on the document
///     {
///         writer.OpenField(Field);
///         for all of the terms
///         {
///             writer.AddTerm(...)
///         }
///         writer.CloseField()
///     }
///     writer.CloseDocument()
/// }
/// </code>
/// </summary>
// Class header, constants and instance state of the term vector writer.
// NOTE(review): this copy of the file carries embedded listing numbers and has lost its
// brace-only lines; the declarations below are otherwise unchanged.
45 sealed public class TermVectorsWriter
// Format version; the constructor writes it as an int at the start of each of the three files.
47 public const int FORMAT_VERSION
= 1;
48 //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
49 public const int FORMAT_SIZE
= 4;
51 //TODO: Figure out how to write with or w/o position information and read back in
// File name suffixes appended to the segment name by the constructor.
// tvx receives one long per document (see WriteDoc).
52 public const System
.String TVX_EXTENSION
= ".tvx";
// tvd receives the per-document field count, field numbers and tvf pointers (see WriteDoc).
53 public const System
.String TVD_EXTENSION
= ".tvd";
// tvf receives the per-field term data (see WriteField).
54 public const System
.String TVF_EXTENSION
= ".tvf";
// Output streams for the three files; assigned in the constructor.
55 private OutputStream tvx
= null, tvd
= null, tvf
= null;
// TVField entries gathered for the current document; CloseField adds to it and WriteDoc reads it.
56 private System
.Collections
.ArrayList fields
= null;
// TVTerm entries for the Field being processed; WriteField reads it.
57 private System
.Collections
.ArrayList terms
= null;
// Used by OpenField to map a Field name to its number.
58 private FieldInfos fieldInfos
;
// Field currently being populated; null when no Field is open (see IsFieldOpen).
60 private TVField currentField
= null;
// tvd file position captured by OpenDocument; -1 means no document is open (see IsDocumentOpen).
61 private long currentDocPointer
= - 1;
/// <summary>Creates a term vectors writer for the given segment in the given
/// directory. Use a new TermVectorsWriter for every segment. Three files are
/// created, named after the segment with the tvx, tvd and tvf extensions, and
/// each one starts with the format version. The number of entries in
/// <code>fieldInfos</code> is only used as the initial capacity of the
/// per-document field list: not every field needs term vectors, so
/// <code>OpenField</code> may be called fewer times than that.
/// </summary>
/// <param name="directory">Directory in which the three files are created.</param>
/// <param name="segment">Segment name used as the file name prefix.</param>
/// <param name="fieldInfos">Field metadata used to resolve Field names to numbers.</param>
public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos)
{
    // Open files for TermVector storage (same order as before: tvx, tvd, tvf).
    tvx = CreateVersionedFile(directory, segment + TVX_EXTENSION);
    tvd = CreateVersionedFile(directory, segment + TVD_EXTENSION);
    tvf = CreateVersionedFile(directory, segment + TVF_EXTENSION);

    this.fieldInfos = fieldInfos;

    int expectedFieldCount = fieldInfos.Size();
    fields = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(expectedFieldCount));
    terms = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
}

/// <summary>Creates <code>fileName</code> in <code>directory</code> and writes
/// the format version as its first int.</summary>
private static OutputStream CreateVersionedFile(Directory directory, System.String fileName)
{
    OutputStream stream = directory.CreateFile(fileName);
    stream.WriteInt(FORMAT_VERSION);
    return stream;
}
// Marks a document as open by recording the current tvd file position in
// currentDocPointer; IsDocumentOpen compares that value against -1.
// NOTE(review): the embedded listing numbers jump from 86 to 90, so one or more statements
// of this method may be missing from this copy — verify against the original file.
86 public void OpenDocument()
90 currentDocPointer
= tvd
.GetFilePointer();
// Ends the current document by resetting currentDocPointer to the -1 sentinel that
// IsDocumentOpen tests for.
// NOTE(review): the embedded listing numbers jump from 94 to 101, so the statements that
// precede the reset are missing from this copy. WriteDoc has no other visible caller, so it
// is presumably invoked from here — confirm against the original file.
94 public void CloseDocument()
101 currentDocPointer
= - 1;
/// <summary>Reports whether a document is currently open, i.e. whether
/// currentDocPointer holds something other than the -1 sentinel.</summary>
public bool IsDocumentOpen()
{
    bool noDocument = (currentDocPointer == - 1);
    return !noDocument;
}
/// <summary>Start processing a Field. This can be followed by a number of calls to
/// AddTerm, and a final call to CloseField to indicate the end of
/// processing of this Field. If a Field was previously open, it is
/// closed automatically.
/// </summary>
/// <param name="field">Name of the Field; resolved to its number through fieldInfos.FieldNumber.</param>
/// <exception cref="System.InvalidOperationException">No document is open.</exception>
public void OpenField(System.String field)
{
    if (!IsDocumentOpen())
    {
        // InvalidOperationException derives from SystemException, so existing
        // catch (SystemException) handlers keep working.
        throw new System.InvalidOperationException("Cannot open Field when no document is open.");
    }

    // Honor the documented contract: a Field that is still open is closed
    // before the new one replaces it, so its data is not silently dropped.
    if (IsFieldOpen())
    {
        CloseField();
    }

    currentField = new TVField(fieldInfos.FieldNumber(field));
}
// Ends processing of the current Field; the visible statement appends currentField to the
// per-document `fields` list that WriteDoc later reads.
// NOTE(review): the embedded listing numbers skip (129 -> 134, 137 -> 139, nothing after 139),
// so statements are missing from this copy. WriteField has no other visible caller and
// nothing visible resets currentField to null, so both presumably happen here — confirm
// against the original file.
126 /// <summary>Finished processing current Field. This should be followed by a call to
127 /// openField before future calls to addTerm.
129 public void CloseField()
134 //System.out.println("closeField()");
137 // save Field and terms
139 fields
.Add(currentField
);
/// <summary>Tells whether a Field is open at the moment, which is the case
/// exactly when currentField is not null.</summary>
public bool IsFieldOpen()
{
    if (currentField == null)
    {
        return false;
    }

    return true;
}
/// <summary>Add a term to the current Field's term vector. Both a document and a
/// Field must already be open, or an exception is thrown. Terms should be added in
/// increasing order of terms, one call per unique term.
/// </summary>
/// <param name="termText">Text of the term.</param>
/// <param name="freq">Number of times this term appears in this Field, in this document.</param>
/// <exception cref="System.InvalidOperationException">No document or no Field is open.</exception>
public void AddTerm(System.String termText, int freq)
{
    // InvalidOperationException derives from SystemException, so existing
    // catch (SystemException) handlers keep working.
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when document is not open");
    }

    // Only reject the call when no Field is open; without this guard the throw
    // below would fire on every call and the term would never be added.
    if (!IsFieldOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when Field is not open");
    }

    AddTermInternal(termText, freq);
}
/// <summary>Records one term for the current Field without checking that a
/// document or Field is open; callers perform those checks.</summary>
/// <param name="termText">Text of the term.</param>
/// <param name="freq">Frequency of the term; also added to currentField.length.</param>
private void AddTermInternal(System.String termText, int freq)
{
    currentField.length += freq;

    TVTerm term = new TVTerm();
    term.termText = termText;
    // WriteField reads term.freq and iterates over `terms`, so the term must
    // carry its frequency and be stored rather than discarded.
    term.freq = freq;
    terms.Add(term);
}
/// <summary>Add specified vectors to the document. A document must be open and
/// no Field may be open.</summary>
/// <param name="vectors">Vectors to add; each is passed to AddTermFreqVector in order.</param>
/// <exception cref="System.InvalidOperationException">No document is open, or a Field is open.</exception>
public void AddVectors(TermFreqVector[] vectors)
{
    // InvalidOperationException derives from SystemException, so existing
    // catch (SystemException) handlers keep working.
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot add term vectors when document is not open");
    }

    // Only reject the call when a Field is open; without this guard the throw
    // below would fire on every call and the loop would be unreachable.
    if (IsFieldOpen())
    {
        throw new System.InvalidOperationException("Cannot add term vectors when Field is open");
    }

    for (int i = 0; i < vectors.Length; i++)
    {
        AddTermFreqVector(vectors[i]);
    }
}
/// <summary>Add specified vector to the document. Document must be open but no Field
/// should be open or exception is thrown. The same document can have <code>AddTerm</code>
/// and <code>AddVectors</code> calls mixed, however a given Field must either be
/// populated with <code>AddTerm</code> or with <code>AddTermFreqVector</code>.
/// </summary>
/// <param name="vector">Vector whose Field and terms are written.</param>
/// <exception cref="System.InvalidOperationException">No document is open, or a Field is open.</exception>
public void AddTermFreqVector(TermFreqVector vector)
{
    // InvalidOperationException derives from SystemException, so existing
    // catch (SystemException) handlers keep working.
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot add term vector when document is not open");
    }

    // Only reject the call when a Field is open; without this guard the throw
    // below would fire on every call and the vector would never be added.
    if (IsFieldOpen())
    {
        throw new System.InvalidOperationException("Cannot add term vector when Field is open");
    }

    AddTermFreqVectorInternal(vector);
}
/// <summary>Opens the vector's Field, records each of its terms with the matching
/// frequency, and closes the Field again. Performs no open-state checks of its own
/// beyond those made by OpenField.</summary>
/// <param name="vector">Vector supplying the Field name, terms and term frequencies.</param>
private void AddTermFreqVectorInternal(TermFreqVector vector)
{
    OpenField(vector.GetField());

    // Fetch the parallel arrays once instead of on every iteration.
    System.String[] termTexts = vector.GetTerms();
    int[] termFreqs = vector.GetTermFrequencies();
    int count = vector.Size();

    for (int i = 0; i < count; i++)
    {
        AddTermInternal(termTexts[i], termFreqs[i]);
    }

    // AddTermFreqVector refuses to run while a Field is open, so the Field opened
    // above has to be closed here; otherwise the second vector passed to
    // AddVectors would always be rejected.
    CloseField();
}
// Closes the writer's streams. Per the in-body comment, the intent is to attempt every
// close and re-throw the first IOException encountered, which is held in `keep`.
// NOTE(review): the embedded listing numbers skip (220 -> 228, 230 -> 236, 236 -> 246, ...)
// and the three catch clauses below appear without their try bodies, so statements are
// missing from this copy — restore them from the original file before relying on this method.
219 /// <summary>Close all streams. </summary>
220 public /*internal*/ void Close()
228 // make an effort to close all streams we can but remember and re-throw
229 // the first exception encountered in this process
230 System
.IO
.IOException keep
= null;
236 catch (System
.IO
.IOException e
)
246 catch (System
.IO
.IOException e
)
256 catch (System
.IO
.IOException e
)
// NOTE(review): this builds a new IOException whose message is keep.StackTrace, which
// drops the original exception's message and does not chain it as an inner exception.
// The guard that presumably checks keep for null is not visible here — confirm.
263 throw new System
.IO
.IOException(keep
.StackTrace
);
/// <summary>Writes the terms collected for the current Field to the tvf stream and
/// records in currentField.tvfPointer where that data starts.
/// Layout written: VInt term count, VInt (currentField.length - term count), then
/// for every term: VInt shared prefix length, VInt remaining length, the remaining
/// chars, and VInt frequency.</summary>
private void WriteField()
{
    // remember where this Field is written
    currentField.tvfPointer = tvf.GetFilePointer();

    // Declared and assigned up front instead of inside the WriteVInt argument.
    int size = terms.Count;
    tvf.WriteVInt(size);
    tvf.WriteVInt(currentField.length - size);

    // Each term is written as a delta against the previous term's text.
    System.String lastTermText = "";
    for (int i = 0; i < size; i++)
    {
        TVTerm term = (TVTerm) terms[i];
        int start = StringHelper.StringDifference(lastTermText, term.termText);
        int length = term.termText.Length - start;

        tvf.WriteVInt(start); // write shared prefix length
        tvf.WriteVInt(length); // write delta length
        tvf.WriteChars(term.termText, start, length); // write delta chars
        tvf.WriteVInt(term.freq);

        lastTermText = term.termText;
    }
}
/// <summary>Writes the index and data records for the current document.
/// tvx receives currentDocPointer as a long. tvd receives the VInt field count,
/// then each field number as a VInt delta from the previous one, then each field's
/// tvf pointer as a VLong delta from the previous one.</summary>
/// <exception cref="System.InvalidOperationException">A Field is still open.</exception>
private void WriteDoc()
{
    // Only fail when a Field really is open; without this guard the throw would
    // fire on every call and nothing below would ever be written.
    // InvalidOperationException derives from SystemException, so existing
    // catch (SystemException) handlers keep working.
    if (IsFieldOpen())
    {
        throw new System.InvalidOperationException("Field is still open while writing document");
    }

    // write document index record
    tvx.WriteLong(currentDocPointer);

    // write document data record, starting with the number of fields
    int size = fields.Count;
    tvd.WriteVInt(size);

    // write Field numbers
    int lastFieldNumber = 0;
    for (int i = 0; i < size; i++)
    {
        TVField field = (TVField) fields[i];
        tvd.WriteVInt(field.number - lastFieldNumber);
        lastFieldNumber = field.number;
    }

    // write Field pointers
    long lastFieldPointer = 0;
    for (int i = 0; i < size; i++)
    {
        TVField field = (TVField) fields[i];
        tvd.WriteVLong(field.tvfPointer - lastFieldPointer);
        lastFieldPointer = field.tvfPointer;
    }
}
/// <summary>Per-Field bookkeeping for the document being written.</summary>
private class TVField
{
    // Field number; set by the constructor and written by WriteDoc.
    internal int number;
    // Position in the tvf file where this Field's data starts; set by WriteField.
    internal long tvfPointer = 0;
    // Running sum of the frequencies passed to AddTermInternal for this Field.
    internal int length = 0;

    internal TVField(int number)
    {
        this.number = number;
    }
}
/// <summary>One term of the Field being written: its text and its frequency.</summary>
private class TVTerm
{
    // Text of the term; WriteField writes it as a delta against the previous term.
    internal System.String termText;
    // Frequency of the term within the Field; written by WriteField.
    internal int freq = 0;
    //int positions[] = null;
}