cvsimport
[beagle.git] / beagled / Lucene.Net / Index / TermVectorsReader.cs
blob2aa4d30d3bfc53db2bf389ce50d4af019a513cc9
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 using System;
18 using Directory = Lucene.Net.Store.Directory;
19 using IndexInput = Lucene.Net.Store.IndexInput;
21 namespace Lucene.Net.Index
24 /// <version> $Id: TermVectorsReader.cs,v 1.3 2006/10/02 17:09:00 joeshaw Exp $
25 /// </version>
26 public class TermVectorsReader : System.ICloneable
28 private FieldInfos fieldInfos;
30 private IndexInput tvx;
31 private IndexInput tvd;
32 private IndexInput tvf;
33 private int size;
35 private int tvdFormat;
36 private int tvfFormat;
/// <summary> Opens the term vector index (tvx), document (tvd) and field (tvf)
/// inputs of a segment and validates the format version stored at the start
/// of each one. Nothing is opened when the segment has no tvx file; the
/// reader then reports a size of zero and returns null from its lookups.
/// </summary>
/// <param name="d">The directory the segment's files are opened from
/// </param>
/// <param name="segment">The segment name; the term vector file extensions are appended to it
/// </param>
/// <param name="fieldInfos">The field infos used to translate between field names and field numbers
/// </param>
/// <throws> IOException if a file carries a format version newer than TermVectorsWriter.FORMAT_VERSION </throws>
public /*internal*/ TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos)
{
    if (d.FileExists(segment + TermVectorsWriter.TVX_EXTENSION))
    {
        try
        {
            tvx = d.OpenInput(segment + TermVectorsWriter.TVX_EXTENSION);
            CheckValidFormat(tvx);
            tvd = d.OpenInput(segment + TermVectorsWriter.TVD_EXTENSION);
            tvdFormat = CheckValidFormat(tvd);
            tvf = d.OpenInput(segment + TermVectorsWriter.TVF_EXTENSION);
            tvfFormat = CheckValidFormat(tvf);

            // A cast binds tighter than division, so the quotient (not the raw
            // length) must be parenthesised: otherwise a length that does not
            // fit in an int would be truncated before it is divided.
            size = (int) (tvx.Length() / 8);
        }
        catch
        {
            // Opening or validating failed part-way through: release whatever
            // was already opened so the handles do not leak, then let the
            // original exception continue to the caller unchanged.
            CloseAfterFailedOpen(tvx);
            CloseAfterFailedOpen(tvd);
            CloseAfterFailedOpen(tvf);
            throw;
        }
    }

    this.fieldInfos = fieldInfos;
}

/// <summary> Best-effort close used only while unwinding a failed constructor.
/// A secondary IOException is deliberately ignored so that it cannot replace
/// the exception that caused the failure.</summary>
private static void CloseAfterFailedOpen(IndexInput input)
{
    if (input == null)
    {
        return;
    }

    try
    {
        input.Close();
    }
    catch (System.IO.IOException)
    {
        // Intentionally ignored: the original failure is the one to report.
    }
}
/// <summary> Reads the format version from the current position of the given
/// input and rejects versions newer than this reader understands.</summary>
/// <param name="in_Renamed">The input to read the version int from
/// </param>
/// <returns> The format version that was read
/// </returns>
/// <throws> IOException if the version is greater than TermVectorsWriter.FORMAT_VERSION </throws>
private int CheckValidFormat(IndexInput in_Renamed)
{
    int version = in_Renamed.ReadInt();
    if (version <= TermVectorsWriter.FORMAT_VERSION)
    {
        return version;
    }

    throw new System.IO.IOException("Incompatible format version: " + version + " expected " + TermVectorsWriter.FORMAT_VERSION + " or less");
}
/// <summary> Closes the tvx, tvd and tvf inputs that are open. Every input is
/// attempted even when closing an earlier one fails; the first failure is
/// remembered and reported after all of them have been tried.</summary>
/// <throws> IOException carrying the message of, and wrapping, the first IOException raised while closing </throws>
internal virtual void Close()
{
    System.IO.IOException keep = null;
    IndexInput[] inputs = new IndexInput[] { tvx, tvd, tvf };

    for (int i = 0; i < inputs.Length; i++)
    {
        if (inputs[i] == null)
        {
            continue;
        }

        try
        {
            inputs[i].Close();
        }
        catch (System.IO.IOException e)
        {
            // Only the first failure is kept; later ones are dropped.
            if (keep == null)
            {
                keep = e;
            }
        }
    }

    if (keep != null)
    {
        // Report the original message and chain the original exception as
        // InnerException, instead of using its stack trace text as the
        // message and losing the exception itself.
        throw new System.IO.IOException(keep.Message, keep);
    }
}
/// <summary> Reports the size computed when the reader was opened: the length
/// of the tvx file divided by eight. It is zero when the segment has no tvx
/// file.</summary>
/// <returns> The number of documents in the reader
/// </returns>
internal virtual int Size()
{
    return this.size;
}
/// <summary> Looks up the term vector stored for one field of one document.</summary>
/// <param name="docNum">The document number to retrieve the vector for
/// </param>
/// <param name="field">The field within the document to retrieve
/// </param>
/// <returns> The TermFreqVector for the document and field, or null when the
/// segment has no term vectors or the document stores none for this field.
/// </returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual TermFreqVector Get(int docNum, System.String field)
{
    int wantedFieldNumber = fieldInfos.FieldNumber(field);

    // Without a tvx input there are no term vectors in this segment at all.
    if (tvx == null)
    {
        return null;
    }

    // The tvx file starts with the format header, followed by one 8-byte
    // entry per document; that entry is the document's position in the tvd
    // file. Later seeks need no header adjustment because the pointers read
    // from the files are used as-is.
    tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
    long tvdPosition = tvx.ReadLong();

    tvd.Seek(tvdPosition);
    int fieldCount = tvd.ReadVInt();

    // The current format stores each field number as-is; any other format
    // stores it as a delta from the previous one.
    bool absoluteNumbers = tvdFormat == TermVectorsWriter.FORMAT_VERSION;

    // Scan every field number: the list is short, is not required to be
    // ordered, and has to be consumed completely anyway to reach the tvf
    // pointers that follow it.
    int currentNumber = 0;
    int matchIndex = -1;
    for (int i = 0; i < fieldCount; i++)
    {
        int value = tvd.ReadVInt();
        if (absoluteNumbers)
        {
            currentNumber = value;
        }
        else
        {
            currentNumber = currentNumber + value;
        }

        if (currentNumber == wantedFieldNumber)
        {
            matchIndex = i;
        }
    }

    // The field may be valid in the segment yet absent from this document.
    if (matchIndex == -1)
    {
        return null;
    }

    // The tvf pointers are stored as deltas; summing up to and including the
    // matching entry yields the position of its vector in the tvf file.
    long tvfPointer = 0;
    for (int i = 0; i <= matchIndex; i++)
    {
        tvfPointer += tvd.ReadVLong();
    }

    return ReadTermVector(field, tvfPointer);
}
/// <summary> Returns every term vector stored for a document.</summary>
/// <param name="docNum">The document number to retrieve the vectors for
/// </param>
/// <returns> All term frequency vectors of the document, or null when the
/// segment has no term vectors or no field of the document is vectorized.
/// </returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual TermFreqVector[] Get(int docNum)
{
    // Without a tvx input there are no term vectors in this segment at all.
    if (tvx == null)
    {
        return null;
    }

    // Skip the format header, then read this document's 8-byte entry, which
    // is its position in the tvd file.
    tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
    long tvdPosition = tvx.ReadLong();

    tvd.Seek(tvdPosition);
    int fieldCount = tvd.ReadVInt();

    // No fields are vectorized for this document.
    if (fieldCount == 0)
    {
        return null;
    }

    // The current format stores each field number as-is; any other format
    // stores it as a delta from the previous one.
    bool absoluteNumbers = tvdFormat == TermVectorsWriter.FORMAT_VERSION;

    System.String[] fieldNames = new System.String[fieldCount];
    int currentNumber = 0;
    for (int i = 0; i < fieldCount; i++)
    {
        int value = tvd.ReadVInt();
        if (absoluteNumbers)
        {
            currentNumber = value;
        }
        else
        {
            currentNumber = currentNumber + value;
        }

        fieldNames[i] = fieldInfos.FieldName(currentNumber);
    }

    // The tvf pointers follow the field numbers and are stored as deltas, so
    // a running sum gives each field's position in the tvf file.
    long[] pointers = new long[fieldCount];
    long runningPointer = 0;
    for (int i = 0; i < fieldCount; i++)
    {
        runningPointer += tvd.ReadVLong();
        pointers[i] = runningPointer;
    }

    return ReadTermVectors(fieldNames, pointers);
}
/// <summary> Reads one term vector per field, pairing each field name with
/// the tvf pointer at the same index.</summary>
/// <param name="fields">The field names to read vectors for
/// </param>
/// <param name="tvfPointers">The position of each field's vector in the tvf file, parallel to fields
/// </param>
/// <returns> The vectors, in the same order as fields
/// </returns>
private SegmentTermVector[] ReadTermVectors(System.String[] fields, long[] tvfPointers)
{
    int count = fields.Length;
    SegmentTermVector[] vectors = new SegmentTermVector[count];

    int index = 0;
    while (index < count)
    {
        vectors[index] = ReadTermVector(fields[index], tvfPointers[index]);
        index++;
    }

    return vectors;
}
/// <summary> Reads a single field's term vector from the tvf file: the terms,
/// their frequencies and, when the stored flags say so, their positions
/// and/or offsets.</summary>
/// <param name="field">The field to read in
/// </param>
/// <param name="tvfPointer">The pointer within the tvf file where we should start reading
/// </param>
/// <returns> The TermVector located at that position; a SegmentTermPositionVector
/// when positions or offsets are stored, otherwise a plain SegmentTermVector
/// </returns>
/// <throws> IOException </throws>
private SegmentTermVector ReadTermVector(System.String field, long tvfPointer)
{
    // The pointer is used as-is; no format header adjustment is applied here.
    tvf.Seek(tvfPointer);

    int termCount = tvf.ReadVInt();

    // This should never occur, but an empty vector is returned if it does.
    if (termCount == 0)
    {
        return new SegmentTermVector(field, null, null);
    }

    bool withPositions;
    bool withOffsets;
    if (tvfFormat != TermVectorsWriter.FORMAT_VERSION)
    {
        // Any other format: consume one VInt here and store neither
        // positions nor offsets.
        tvf.ReadVInt();
        withPositions = false;
        withOffsets = false;
    }
    else
    {
        // Current format: a flag byte says what is stored with each term.
        byte flags = tvf.ReadByte();
        withPositions = (flags & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
        withOffsets = (flags & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }

    System.String[] termTexts = new System.String[termCount];
    int[] frequencies = new int[termCount];

    // These stay null unless the corresponding data is stored.
    int[][] allPositions = null;
    if (withPositions)
    {
        allPositions = new int[termCount][];
    }

    TermVectorOffsetInfo[][] allOffsets = null;
    if (withOffsets)
    {
        allOffsets = new TermVectorOffsetInfo[termCount][];
    }

    // Each term is stored as the length of the prefix it shares with the
    // previous term plus the characters that follow that prefix.
    char[] current = new char[10]; // start with room for 10 characters
    char[] previous = new char[]{};

    for (int termIndex = 0; termIndex < termCount; termIndex++)
    {
        int sharedLength = tvf.ReadVInt();
        int suffixLength = tvf.ReadVInt();
        int termLength = sharedLength + suffixLength;

        if (current.Length < termLength)
        {
            // Too small for this term: switch to a bigger buffer. The shared
            // prefix is still available in the previous buffer.
            current = new char[termLength];
        }

        if (sharedLength > 0)
        {
            // Carry the shared prefix over from the previous term's buffer.
            Array.Copy(previous, 0, current, 0, sharedLength);
        }

        tvf.ReadChars(current, sharedLength, suffixLength);
        termTexts[termIndex] = new System.String(current, 0, termLength);
        previous = current;

        int frequency = tvf.ReadVInt();
        frequencies[termIndex] = frequency;

        if (withPositions)
        {
            // Positions are stored as deltas from the preceding position.
            int[] termPositions = new int[frequency];
            allPositions[termIndex] = termPositions;
            int runningPosition = 0;
            for (int k = 0; k < frequency; k++)
            {
                runningPosition = runningPosition + tvf.ReadVInt();
                termPositions[k] = runningPosition;
            }
        }

        if (withOffsets)
        {
            // A start offset is a delta from the preceding end offset, and an
            // end offset is a delta from its own start offset.
            TermVectorOffsetInfo[] termOffsets = new TermVectorOffsetInfo[frequency];
            allOffsets[termIndex] = termOffsets;
            int lastEnd = 0;
            for (int k = 0; k < frequency; k++)
            {
                int begin = lastEnd + tvf.ReadVInt();
                int end = begin + tvf.ReadVInt();
                termOffsets[k] = new TermVectorOffsetInfo(begin, end);
                lastEnd = end;
            }
        }
    }

    if (withPositions || withOffsets)
    {
        return new SegmentTermPositionVector(field, termTexts, frequencies, allPositions, allOffsets);
    }

    return new SegmentTermVector(field, termTexts, frequencies);
}
/// <summary> Creates a copy of this reader that has its own clones of the
/// tvx, tvd and tvf inputs. All other fields are copied shallowly by
/// MemberwiseClone, so the field infos object is shared with the original.
/// </summary>
/// <returns> The cloned reader, or null when any of the three inputs is not
/// open (for example when the segment has no term vectors)
/// </returns>
public virtual System.Object Clone()
{
    if (tvx == null || tvd == null || tvf == null)
    {
        return null;
    }

    // No try/catch around MemberwiseClone: an empty catch here would hide
    // the real failure and leave a null reference to be dereferenced on the
    // next line.
    TermVectorsReader clone = (TermVectorsReader) base.MemberwiseClone();

    clone.tvx = (IndexInput) tvx.Clone();
    clone.tvd = (IndexInput) tvd.Clone();
    clone.tvf = (IndexInput) tvf.Clone();

    return clone;
}