Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / Lucene.Net / Index / TermVectorsWriter.cs
blobf0dfafabaae94c1d7dee5556ce9f4607db455577
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 using System;
17 using Directory = Lucene.Net.Store.Directory;
18 using OutputStream = Lucene.Net.Store.OutputStream;
19 using StringHelper = Lucene.Net.Util.StringHelper;
20 namespace Lucene.Net.Index
23 /// <summary> Writer works by opening a document and then opening the fields within the document and then
24 /// writing out the vectors for each Field.
25 ///
26 /// Rough usage:
27 ///
28 /// <CODE>
29 /// for each document
30 /// {
31 /// writer.openDocument();
32 /// for each Field on the document
33 /// {
34 /// writer.openField(Field);
35 /// for all of the terms
36 /// {
37 /// writer.addTerm(...)
38 /// }
39 /// writer.closeField
40 /// }
41 /// writer.closeDocument()
42 /// }
43 /// </CODE>
44 /// </summary>
45 sealed public class TermVectorsWriter
/// <summary>Version number written as the first int of every term vector file.</summary>
public const int FORMAT_VERSION = 1;

/// <summary>Size in bytes that FORMAT_VERSION takes up at the beginning of each file.</summary>
public const int FORMAT_SIZE = 4;

// TODO: Figure out how to write with or w/o position information and read back in

/// <summary>Extension of the index file; it receives one pointer per document.</summary>
public const string TVX_EXTENSION = ".tvx";

/// <summary>Extension of the document file; it receives the field numbers and field pointers of each document.</summary>
public const string TVD_EXTENSION = ".tvd";

/// <summary>Extension of the field file; it receives the terms and frequencies of each field.</summary>
public const string TVF_EXTENSION = ".tvf";

// Output streams for the three files above; created by the constructor.
private OutputStream tvx;
private OutputStream tvd;
private OutputStream tvf;

// TVField entries of the fields already closed in the current document.
private System.Collections.ArrayList fields;

// TVTerm entries collected for the field that is currently open.
private System.Collections.ArrayList terms;

// Used to translate field names into field numbers.
private FieldInfos fieldInfos;

// Field being populated, or null when no field is open.
private TVField currentField;

// Position in the .tvd stream where the open document starts; -1 means no document is open.
private long currentDocPointer = -1;
/// <summary>Creates a term vectors writer for the specified segment in the
/// specified directory. A new TermVectorsWriter should be created for each
/// segment. Three files named after the segment (extensions .tvx, .tvd and
/// .tvf) are created and FORMAT_VERSION is written at the start of each one.
/// Not every field known to <paramref name="fieldInfos"/> needs term vectors,
/// so the number of <c>OpenField</c> calls per document may be smaller than
/// <c>fieldInfos.Size()</c>.
/// If construction fails part-way, the streams opened so far are closed
/// before the original exception is rethrown.
/// </summary>
/// <param name="directory">Directory in which the three files are created.</param>
/// <param name="segment">Segment name used as the base name of the files.</param>
/// <param name="fieldInfos">Field metadata; used to size the per-document field
/// list and later to map field names to field numbers.</param>
public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos)
{
    try
    {
        // Open files for TermVector storage
        tvx = directory.CreateFile(segment + TVX_EXTENSION);
        tvx.WriteInt(FORMAT_VERSION);
        tvd = directory.CreateFile(segment + TVD_EXTENSION);
        tvd.WriteInt(FORMAT_VERSION);
        tvf = directory.CreateFile(segment + TVF_EXTENSION);
        tvf.WriteInt(FORMAT_VERSION);

        this.fieldInfos = fieldInfos;
        fields = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(fieldInfos.Size()));
        terms = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
    }
    catch
    {
        // The caller never receives the instance when the constructor throws,
        // so nobody else could release the streams that were already opened.
        CloseAfterFailedInit(tvx);
        CloseAfterFailedInit(tvd);
        CloseAfterFailedInit(tvf);
        throw;
    }
}

/// <summary>Closes a stream opened by a failed constructor call, ignoring a
/// secondary I/O failure so that the original exception is the one that
/// reaches the caller.</summary>
private static void CloseAfterFailedInit(OutputStream stream)
{
    if (stream == null)
    {
        return;
    }

    try
    {
        stream.Close();
    }
    catch (System.IO.IOException)
    {
        // Deliberately ignored: the constructor is already failing.
    }
}
/// <summary>Starts a new document. A document that is still open is closed
/// first; the current position of the .tvd stream is then remembered as the
/// pointer of the new document.</summary>
public void OpenDocument()
{
    this.CloseDocument();

    long documentStart = this.tvd.GetFilePointer();
    this.currentDocPointer = documentStart;
}
/// <summary>Finishes the current document: closes the open field (if any),
/// writes the document record, empties the per-document field list and marks
/// the writer as having no open document. Does nothing when no document is
/// open.</summary>
public void CloseDocument()
{
    if (!IsDocumentOpen())
    {
        return;
    }

    CloseField();
    WriteDoc();
    fields.Clear();
    currentDocPointer = -1;
}
/// <summary>Returns true if a document is currently open, i.e. a document
/// pointer other than the -1 sentinel has been recorded.</summary>
public bool IsDocumentOpen()
{
    bool noDocument = (currentDocPointer == -1);
    return !noDocument;
}
/// <summary>Starts processing a Field. This can be followed by a number of
/// calls to AddTerm, and a final call to CloseField to indicate the end of
/// processing of this Field. If a Field was previously open, it is closed
/// automatically.
/// </summary>
/// <param name="field">Name of the field; it is resolved to a field number
/// through the FieldInfos passed to the constructor.</param>
/// <exception cref="System.InvalidOperationException">No document is open.
/// (InvalidOperationException derives from SystemException, so handlers
/// written for the previously thrown type still match.)</exception>
public void OpenField(System.String field)
{
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot open Field when no document is open.");
    }

    CloseField();
    currentField = new TVField(fieldInfos.FieldNumber(field));
}
/// <summary>Finishes the current Field: its collected terms are written out,
/// the field is appended to the document's field list and the term buffer is
/// emptied. OpenField must be called again before further AddTerm calls.
/// Does nothing when no Field is open.
/// </summary>
public void CloseField()
{
    if (currentField == null)
    {
        return;
    }

    // save Field and terms
    WriteField();
    fields.Add(currentField);
    terms.Clear();
    currentField = null;
}
/// <summary>Returns true if a Field is currently open.</summary>
public bool IsFieldOpen()
{
    if (currentField == null)
    {
        return false;
    }

    return true;
}
/// <summary>Adds a term to the open Field's term vector. Both a document and
/// a Field must already be open, or an exception is thrown. Terms should be
/// added in increasing order of terms, one call per unique term.
/// </summary>
/// <param name="termText">Text of the term.</param>
/// <param name="freq">Number of times this term appears in this Field, in
/// this document.</param>
/// <exception cref="System.InvalidOperationException">No document or no Field
/// is open. (InvalidOperationException derives from SystemException, so
/// handlers written for the previously thrown type still match.)</exception>
public void AddTerm(System.String termText, int freq)
{
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when document is not open");
    }

    if (!IsFieldOpen())
    {
        throw new System.InvalidOperationException("Cannot add terms when Field is not open");
    }

    AddTermInternal(termText, freq);
}
/// <summary>Buffers one term for the open Field without validating the writer
/// state: the frequency is added to the field's running length and a new
/// TVTerm entry is appended to the term buffer.</summary>
private void AddTermInternal(System.String termText, int freq)
{
    currentField.length = currentField.length + freq;

    TVTerm entry = new TVTerm();
    entry.freq = freq;
    entry.termText = termText;
    terms.Add(entry);
}
/// <summary>Adds the specified vectors to the document, one Field per vector,
/// in array order.</summary>
/// <param name="vectors">Vectors to add; must not be null.</param>
/// <exception cref="System.InvalidOperationException">No document is open, or
/// a Field is still open. (InvalidOperationException derives from
/// SystemException, so handlers written for the previously thrown type still
/// match.)</exception>
/// <exception cref="System.ArgumentNullException"><paramref name="vectors"/>
/// is null.</exception>
public void AddVectors(TermFreqVector[] vectors)
{
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot add term vectors when document is not open");
    }

    if (IsFieldOpen())
    {
        throw new System.InvalidOperationException("Cannot add term vectors when Field is open");
    }

    // Checked after the state checks so that those keep their precedence.
    if (vectors == null)
    {
        throw new System.ArgumentNullException("vectors");
    }

    foreach (TermFreqVector vector in vectors)
    {
        AddTermFreqVector(vector);
    }
}
/// <summary>Adds the specified vector to the document. A document must be
/// open but no Field may be open, or an exception is thrown. The same
/// document can mix <c>AddTerm</c> and <c>AddVectors</c> calls, however a
/// given Field must be populated either with <c>AddTerm</c> or with
/// <c>AddTermFreqVector</c>, not both.
/// </summary>
/// <param name="vector">Vector to add; must not be null.</param>
/// <exception cref="System.InvalidOperationException">No document is open, or
/// a Field is still open. (InvalidOperationException derives from
/// SystemException, so handlers written for the previously thrown type still
/// match.)</exception>
/// <exception cref="System.ArgumentNullException"><paramref name="vector"/>
/// is null.</exception>
public void AddTermFreqVector(TermFreqVector vector)
{
    if (!IsDocumentOpen())
    {
        throw new System.InvalidOperationException("Cannot add term vector when document is not open");
    }

    if (IsFieldOpen())
    {
        throw new System.InvalidOperationException("Cannot add term vector when Field is open");
    }

    // Checked after the state checks so that those keep their precedence.
    if (vector == null)
    {
        throw new System.ArgumentNullException("vector");
    }

    AddTermFreqVectorInternal(vector);
}
/// <summary>Writes one complete vector as a Field of the open document:
/// opens the vector's field, buffers every (term, frequency) pair and closes
/// the field again. The writer state is not validated here.</summary>
private void AddTermFreqVectorInternal(TermFreqVector vector)
{
    OpenField(vector.GetField());

    // Fetch the arrays and the size once instead of on every iteration.
    // The locals are deliberately not named "terms", which is the buffer field.
    // NOTE(review): assumes GetTerms() returns string[] and
    // GetTermFrequencies() returns int[], as the indexing into them and the
    // AddTermInternal(string, int) call imply; confirm against TermFreqVector.
    System.String[] termTexts = vector.GetTerms();
    int[] termFreqs = vector.GetTermFrequencies();
    int count = vector.Size();

    for (int i = 0; i < count; i++)
    {
        AddTermInternal(termTexts[i], termFreqs[i]);
    }

    CloseField();
}
/// <summary>Closes the open document (if any) and then all three streams.
/// Every stream is attempted even if an earlier one fails to close.</summary>
/// <exception cref="System.IO.IOException">At least one stream failed to
/// close. The exception carries the message of the first failure and exposes
/// that failure as its InnerException.</exception>
public /*internal*/ void Close()
{
    try
    {
        CloseDocument();
    }
    finally
    {
        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        System.IO.IOException keep = null;
        if (tvx != null)
        {
            try
            {
                tvx.Close();
            }
            catch (System.IO.IOException e)
            {
                if (keep == null)
                    keep = e;
            }
        }
        if (tvd != null)
        {
            try
            {
                tvd.Close();
            }
            catch (System.IO.IOException e)
            {
                if (keep == null)
                    keep = e;
            }
        }
        if (tvf != null)
        {
            try
            {
                tvf.Close();
            }
            catch (System.IO.IOException e)
            {
                if (keep == null)
                    keep = e;
            }
        }
        if (keep != null)
        {
            // Wrap rather than "throw keep;": wrapping preserves the original
            // message and stack trace through InnerException, whereas the old
            // code used the stack trace text as the message and dropped the rest.
            throw new System.IO.IOException(keep.Message, keep);
        }
    }
}
/// <summary>Writes the buffered terms of the current Field to the .tvf stream
/// and records in the field where its data starts. Each term is stored as the
/// length of the prefix it shares with the previous term, the length of the
/// remaining suffix, the suffix characters and the term frequency.</summary>
private void WriteField()
{
    // remember where this Field is written
    currentField.tvfPointer = tvf.GetFilePointer();

    int termCount = terms.Count;
    tvf.WriteVInt(termCount);
    // Second header value: the field's accumulated length minus the term count.
    tvf.WriteVInt(currentField.length - termCount);

    System.String previousText = "";
    foreach (TVTerm entry in terms)
    {
        System.String text = entry.termText;
        int sharedPrefix = StringHelper.StringDifference(previousText, text);
        int suffixLength = text.Length - sharedPrefix;

        tvf.WriteVInt(sharedPrefix);                      // shared prefix length
        tvf.WriteVInt(suffixLength);                      // delta length
        tvf.WriteChars(text, sharedPrefix, suffixLength); // delta chars
        tvf.WriteVInt(entry.freq);

        previousText = text;
    }
}
/// <summary>Writes the record of the current document: the document pointer
/// goes to the .tvx stream; the field count, the delta-encoded field numbers
/// and then the delta-encoded field pointers go to the .tvd stream.</summary>
/// <exception cref="System.SystemException">A Field is still open.</exception>
private void WriteDoc()
{
    if (currentField != null)
    {
        throw new System.SystemException("Field is still open while writing document");
    }

    // document index record
    tvx.WriteLong(currentDocPointer);

    // document data record, starting with the number of fields
    int fieldCount = fields.Count;
    tvd.WriteVInt(fieldCount);

    // field numbers, each as the difference to the previous one
    int previousNumber = 0;
    foreach (TVField entry in fields)
    {
        tvd.WriteVInt(entry.number - previousNumber);
        previousNumber = entry.number;
    }

    // field pointers, each as the difference to the previous one
    long previousPointer = 0;
    foreach (TVField entry in fields)
    {
        tvd.WriteVLong(entry.tvfPointer - previousPointer);
        previousPointer = entry.tvfPointer;
    }
}
/// <summary>Bookkeeping for one Field of the current document.</summary>
private class TVField
{
    // Field number, as resolved through FieldInfos.
    internal int number;

    // Position in the .tvf stream where this field's data starts; set when the field is written.
    internal long tvfPointer;

    // Running sum of the frequencies of the terms added to this field.
    internal int length;

    internal TVField(int number)
    {
        this.number = number;
        this.tvfPointer = 0;
        this.length = 0;
    }
}
/// <summary>One buffered term of the open Field: its text and its frequency.</summary>
private class TVTerm
{
    // Term frequency; stays 0 until assigned.
    internal int freq;

    // Term text; assigned by the code that creates the entry.
    internal System.String termText;

    // Positions are not stored yet:
    //int positions[] = null;
}