2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 using Directory
= Lucene
.Net
.Store
.Directory
;
19 using IndexInput
= Lucene
.Net
.Store
.IndexInput
;
21 namespace Lucene
.Net
.Index
24 /// <version> $Id: TermVectorsReader.cs,v 1.3 2006/10/02 17:09:00 joeshaw Exp $
// Reads the term vector data of a single segment through three inputs (tvx, tvd, tvf).
// Implements ICloneable; Clone() gives the copy its own clones of those inputs.
26 public class TermVectorsReader
: System
.ICloneable
// Field metadata handed to the constructor; used to translate field names to
// field numbers (FieldNumber) and field numbers back to names (FieldName).
28 private FieldInfos fieldInfos
;
// Per-document index: after a FORMAT_SIZE header, one 8-byte long per document
// that the Get methods read as the starting position for that document.
30 private IndexInput tvx
;
// Per-document data: a field count, the field numbers, and the VLong deltas that
// are summed into pointers into tvf.
32 private IndexInput tvd
;
// Per-field data: the terms, their frequencies and the optional positions/offsets.
34 private IndexInput tvf
;
// NOTE(review): the constructor also assigns a `size` member whose declaration is
// not visible in this excerpt.
// Format version read from the header of tvd by CheckValidFormat.
35 private int tvdFormat
;
// Format version read from the header of tvf by CheckValidFormat.
36 private int tvfFormat
;
/// <summary> Opens the term vector inputs of a segment when the segment has a term vector index file.</summary>
/// <param name="d">The directory that holds the segment files</param>
/// <param name="segment">The segment name; the term vector file extensions are appended to it</param>
/// <param name="fieldInfos">The field metadata used to translate between field names and field numbers</param>
/// <throws> IOException if an input cannot be opened or declares an unsupported format version </throws>
public /*internal*/ TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos)
{
    // Without an index (TVX) file nothing is opened and tvx/tvd/tvf keep their
    // default null value, which the other members test for.
    if (d.FileExists(segment + TermVectorsWriter.TVX_EXTENSION))
    {
        tvx = d.OpenInput(segment + TermVectorsWriter.TVX_EXTENSION);
        CheckValidFormat(tvx);

        tvd = d.OpenInput(segment + TermVectorsWriter.TVD_EXTENSION);
        tvdFormat = CheckValidFormat(tvd);

        tvf = d.OpenInput(segment + TermVectorsWriter.TVF_EXTENSION);
        tvfFormat = CheckValidFormat(tvf);

        // The index holds one 8-byte entry per document. Divide while the length
        // is still a long and narrow afterwards: a cast binds tighter than '/',
        // so "(int) tvx.Length() / 8" would truncate the length of an index
        // larger than int.MaxValue bytes before dividing.
        size = (int) (tvx.Length() / 8);
    }

    this.fieldInfos = fieldInfos;
}
/// <summary> Reads the format version stored at the current position of the input and rejects versions newer than this reader supports.</summary>
/// <param name="in_Renamed">The input to read the version from</param>
/// <returns> The format version that was read </returns>
/// <throws> IOException if the version is greater than TermVectorsWriter.FORMAT_VERSION </throws>
private int CheckValidFormat(IndexInput in_Renamed)
{
    int version = in_Renamed.ReadInt();

    // Versions up to and including the writer's current one are accepted.
    if (version <= TermVectorsWriter.FORMAT_VERSION)
    {
        return version;
    }

    throw new System.IO.IOException("Incompatible format version: " + version + " expected " + TermVectorsWriter.FORMAT_VERSION + " or less");
}
/// <summary> Closes the term vector inputs, attempting every one of them even when an earlier close fails.</summary>
/// <throws> IOException carrying the message of the first failure, with that failure as its InnerException </throws>
internal virtual void Close()
{
    // make all effort to close up. Keep the first exception
    // and throw it as a new one once every input has been tried.
    System.IO.IOException keep = null;

    IndexInput[] inputs = new IndexInput[] { tvx, tvd, tvf };
    foreach (IndexInput input in inputs)
    {
        // The inputs are null when the segment had no term vector index file.
        if (input == null)
        {
            continue;
        }

        try
        {
            input.Close();
        }
        catch (System.IO.IOException e)
        {
            if (keep == null)
            {
                keep = e;
            }
        }
    }

    if (keep != null)
    {
        // Keep the real message and chain the original exception. Passing
        // keep.StackTrace as the message hid the cause and discarded the
        // original exception object.
        throw new System.IO.IOException(keep.Message, keep);
    }
}
105 /// <summary> Reports the number of documents covered by this reader. </summary>
106 /// <returns> The number of documents in the reader </returns>
// NOTE(review): the method body is not visible in this excerpt; presumably it
// returns the size the constructor derives from the tvx length - confirm.
108 internal virtual int Size()
113 /// <summary> Retrieve the term vector for the given document and field</summary>
114 /// <param name="docNum">The document number to retrieve the vector for </param>
116 /// <param name="field">The field within the document to retrieve </param>
118 /// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field. </returns>
120 /// <throws> IOException if there is an error reading the term vector files </throws>
// NOTE(review): several statements of this method are not visible in this excerpt
// (the guard on the inputs, the positioning of tvd, the declarations of `number`
// and `found`, the branch heads and the final return). The comments below describe
// only the visible statements.
121 public /*internal*/ virtual TermFreqVector
Get(int docNum
, System
.String field
)
123 // Check if no term vectors are available for this segment at all
// Translate the field name into the field number stored in the tvd data.
124 int fieldNumber
= fieldInfos
.FieldNumber(field
);
// Stays null unless a vector is read further down.
125 TermFreqVector result
= null;
128 //We need to account for the FORMAT_SIZE when seeking in the tvx
129 //We don't need to do this in other seeks because we already have the
131 //that was written in another file
// Each document owns one 8-byte entry that follows the FORMAT_SIZE header.
132 tvx
.Seek((docNum
* 8L) + TermVectorsWriter
.FORMAT_SIZE
);
133 //System.out.println("TVX Pointer: " + tvx.getFilePointer());
134 long position
= tvx
.ReadLong();
// Number of field entries stored for this document.
137 int fieldCount
= tvd
.ReadVInt();
138 //System.out.println("Num Fields: " + fieldCount);
139 // There are only a few fields per document. We opt for a full scan
140 // rather than requiring that they be ordered. We need to read through
141 // all of the fields anyway to get to the tvf pointers.
144 for (int i
= 0; i
< fieldCount
; i
++)
// With the current format version the stored value is taken as the field number
// itself; the other assignment below accumulates the stored values instead.
146 if (tvdFormat
== TermVectorsWriter
.FORMAT_VERSION
)
147 number
= tvd
.ReadVInt();
149 number
+= tvd
.ReadVInt();
// NOTE(review): presumably records the index of the match in `found`; that
// statement is not visible here - confirm.
151 if (number
== fieldNumber
)
155 // This field, although valid in the segment, was not found in this
159 // Compute position in the tvf file
// Sums the VLong values of entries 0..found (inclusive) into the tvf pointer.
161 for (int i
= 0; i
<= found
; i
++)
162 position
+= tvd
.ReadVLong();
164 result
= ReadTermVector(field
, position
);
168 //System.out.println("Field not found");
173 //System.out.println("No tvx file");
178 /// <summary> Return all term vectors stored for this document or null if they could not be read in. </summary>
181 /// <param name="docNum">The document number to retrieve the vector for </param>
183 /// <returns> All term frequency vectors </returns>
185 /// <throws> IOException if there is an error reading the term vector files </throws>
// NOTE(review): several statements of this method are not visible in this excerpt
// (the guard on the inputs, the positioning of tvd, the declaration of `number`,
// the branch heads and the final return). The comments below describe only the
// visible statements.
186 public /*internal*/ virtual TermFreqVector
[] Get(int docNum
)
// Stays null unless vectors are read further down.
188 TermFreqVector
[] result
= null;
189 // Check if no term vectors are available for this segment at all
192 //We need to offset by the FORMAT_SIZE header when seeking in the tvx
// Each document owns one 8-byte entry that follows the header.
193 tvx
.Seek((docNum
* 8L) + TermVectorsWriter
.FORMAT_SIZE
);
194 long position
= tvx
.ReadLong();
// Number of field entries stored for this document.
197 int fieldCount
= tvd
.ReadVInt();
199 // No fields are vectorized for this document
203 System
.String
[] fields
= new System
.String
[fieldCount
];
// First pass over the entries: resolve every field number to its name.
205 for (int i
= 0; i
< fieldCount
; i
++)
// With the current format version the stored value is taken as the field number
// itself; the other assignment below accumulates the stored values instead.
207 if (tvdFormat
== TermVectorsWriter
.FORMAT_VERSION
)
208 number
= tvd
.ReadVInt();
210 number
+= tvd
.ReadVInt();
212 fields
[i
] = fieldInfos
.FieldName(number
);
215 // Compute position in the tvf file
// Second pass: each VLong is added to the running position, and the running
// value after each addition is the tvf pointer of the corresponding field.
217 long[] tvfPointers
= new long[fieldCount
];
218 for (int i
= 0; i
< fieldCount
; i
++)
220 position
+= tvd
.ReadVLong();
221 tvfPointers
[i
] = position
;
224 result
= ReadTermVectors(fields
, tvfPointers
);
229 //System.out.println("No tvx file");
/// <summary> Reads one term vector per field, pairing each field name with the tvf pointer at the same index.</summary>
/// <param name="fields">The names of the fields to read</param>
/// <param name="tvfPointers">The tvf position of each field, parallel to fields</param>
/// <returns> The term vectors, in the order of fields </returns>
private SegmentTermVector[] ReadTermVectors(System.String[] fields, long[] tvfPointers)
{
    int count = fields.Length;
    SegmentTermVector[] vectors = new SegmentTermVector[count];

    int index = 0;
    while (index < count)
    {
        vectors[index] = ReadTermVector(fields[index], tvfPointers[index]);
        index++;
    }

    return vectors;
}
245 /// <summary> Reads the term vector of one field from the tvf input. </summary>
246 /// <param name="field">The field to read in </param>
248 /// <param name="tvfPointer">The pointer within the tvf file where we should start reading </param>
250 /// <returns> The TermVector located at that position </returns>
252 /// <throws> IOException </throws>
// NOTE(review): several statements of this method are not visible in this excerpt
// (declarations of storePositions, storeOffsets, start, deltaLength, totalLength
// and prevOffset, most branch heads, and the final return). The comments below
// describe only the visible statements.
253 private SegmentTermVector
ReadTermVector(System
.String field
, long tvfPointer
)
256 // Now read the data from specified position
257 //We don't need to offset by the FORMAT here since the pointer already includes the offset
258 tvf
.Seek(tvfPointer
);
260 int numTerms
= tvf
.ReadVInt();
261 //System.out.println("Num Terms: " + numTerms);
262 // If no terms - return a constant empty termvector. However, this should never occur!
// NOTE(review): the condition guarding this return is not visible; per the comment
// above it presumably tests numTerms - confirm.
264 return new SegmentTermVector(field
, null, null);
// With the current format version a flags byte says whether positions and/or
// offsets were stored; in the other visible branch both flags are set to false.
269 if (tvfFormat
== TermVectorsWriter
.FORMAT_VERSION
)
271 byte bits
= tvf
.ReadByte();
272 storePositions
= (bits
& TermVectorsWriter
.STORE_POSITIONS_WITH_TERMVECTOR
) != 0;
273 storeOffsets
= (bits
& TermVectorsWriter
.STORE_OFFSET_WITH_TERMVECTOR
) != 0;
278 storePositions
= false;
279 storeOffsets
= false;
// Parallel arrays with one slot per term.
282 System
.String
[] terms
= new System
.String
[numTerms
];
283 int[] termFreqs
= new int[numTerms
];
285 // we may not need these, but declare them
286 int[][] positions
= null;
287 TermVectorOffsetInfo
[][] offsets
= null;
// NOTE(review): the two allocations below are presumably guarded by storePositions
// and storeOffsets respectively; the guards are not visible - confirm.
289 positions
= new int[numTerms
][];
291 offsets
= new TermVectorOffsetInfo
[numTerms
][];
296 char[] buffer
= new char[10]; // init the buffer with a length of 10 character
297 char[] previousBuffer
= new char[]{};
// Each term is stored as (start, deltaLength, chars): the first `start` characters
// are taken from the previous term's buffer and `deltaLength` new characters are
// read from tvf behind them.
299 for (int i
= 0; i
< numTerms
; i
++)
301 start
= tvf
.ReadVInt();
302 deltaLength
= tvf
.ReadVInt();
303 totalLength
= start
+ deltaLength
;
// Grow the buffer when the term does not fit.
304 if (buffer
.Length
< totalLength
)
307 buffer
= null; // give a hint to garbage collector
308 buffer
= new char[totalLength
];
311 // just copy if necessary
// Carries the shared prefix over from the previous term's buffer into the new one.
312 Array
.Copy(previousBuffer
, 0, buffer
, 0, start
);
// Append the new characters after the shared prefix and materialize the term.
315 tvf
.ReadChars(buffer
, start
, deltaLength
);
316 terms
[i
] = new System
.String(buffer
, 0, totalLength
);
317 previousBuffer
= buffer
;
// The frequency also sizes the position and offset arrays of this term.
318 int freq
= tvf
.ReadVInt();
323 //read in the positions
324 int[] pos
= new int[freq
];
// Positions are stored as deltas from the previous position of the same term.
326 int prevPosition
= 0;
327 for (int j
= 0; j
< freq
; j
++)
329 pos
[j
] = prevPosition
+ tvf
.ReadVInt();
330 prevPosition
= pos
[j
];
336 TermVectorOffsetInfo
[] offs
= new TermVectorOffsetInfo
[freq
];
// Start offsets are stored as deltas from the previous end offset, and end offsets
// as deltas from their own start offset.
339 for (int j
= 0; j
< freq
; j
++)
341 int startOffset
= prevOffset
+ tvf
.ReadVInt();
342 int endOffset
= startOffset
+ tvf
.ReadVInt();
343 offs
[j
] = new TermVectorOffsetInfo(startOffset
, endOffset
);
344 prevOffset
= endOffset
;
// A position-aware vector is built when either positions or offsets were stored;
// the other visible assignment builds a plain term/frequency vector.
349 SegmentTermVector tv
;
350 if (storePositions
|| storeOffsets
)
352 tv
= new SegmentTermPositionVector(field
, terms
, termFreqs
, positions
, offsets
);
356 tv
= new SegmentTermVector(field
, terms
, termFreqs
);
// Creates a member-wise copy of this reader and gives the copy its own clones of
// the tvx, tvd and tvf inputs.
// NOTE(review): the statement executed when an input is null, the body of the
// catch clause and the final return are not visible in this excerpt.
361 public virtual System
.Object
Clone()
// All three inputs are required before a copy is attempted.
364 if (tvx
== null || tvd
== null || tvf
== null)
367 TermVectorsReader clone
= null;
370 clone
= (TermVectorsReader
) base.MemberwiseClone();
// NOTE(review): if this handler swallows the exception, `clone` is still null when
// it is dereferenced below - confirm the handler's body.
372 catch (System
.Exception
)
// The member-wise copy shares the input objects with this reader, so each one is
// replaced by a clone of its own.
376 clone
.tvx
= (IndexInput
) tvx
.Clone();
377 clone
.tvd
= (IndexInput
) tvd
.Clone();
378 clone
.tvf
= (IndexInput
) tvf
.Clone();