/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 using Directory
= Lucene
.Net
.Store
.Directory
;
18 using IndexInput
= Lucene
.Net
.Store
.IndexInput
;
19 namespace Lucene
.Net
.Index
22 /// <version> $Id: TermVectorsReader.cs,v 1.2 2005/10/06 19:29:56 dsd Exp $
24 class TermVectorsReader
: System
.ICloneable
// Field metadata supplied to the constructor; used to translate between
// field names and field numbers when reading vectors.
private FieldInfos fieldInfos;

// Inputs opened with the TVX / TVD / TVF extensions of TermVectorsWriter.
// They are only assigned when the segment's TVX file exists.
private IndexInput tvx;
private IndexInput tvd;
private IndexInput tvf;

// Length of tvx divided by 8; assigned in the constructor.
// NOTE(review): this declaration was not visible in the listing and was
// restored because the constructor assigns to it.
private int size;

// Format versions read from the headers of tvd and tvf.
private int tvdFormat;
private int tvfFormat;
/// <summary> Opens the term vector inputs of a segment when its TVX file exists.
/// When it does not exist the three inputs are left unassigned.
/// </summary>
/// <param name="d">Directory that holds the segment's files
/// </param>
/// <param name="segment">Segment name; the TermVectorsWriter extensions are appended to it
/// </param>
/// <param name="fieldInfos">Field metadata kept for later name/number lookups
/// </param>
/// <throws> IOException if a file's format version is greater than TermVectorsWriter.FORMAT_VERSION </throws>
public /*internal*/ TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos)
{
    if (d.FileExists(segment + TermVectorsWriter.TVX_EXTENSION))
    {
        tvx = d.OpenInput(segment + TermVectorsWriter.TVX_EXTENSION);
        // The tvx header is validated but its version is not kept.
        CheckValidFormat(tvx);
        tvd = d.OpenInput(segment + TermVectorsWriter.TVD_EXTENSION);
        tvdFormat = CheckValidFormat(tvd);
        tvf = d.OpenInput(segment + TermVectorsWriter.TVF_EXTENSION);
        tvfFormat = CheckValidFormat(tvf);
        // Divide first, then narrow. A cast binds tighter than '/', so the
        // previous "(int) tvx.Length() / 8" narrowed the length before dividing.
        // NOTE(review): assumes Length() returns a long - confirm against IndexInput.
        size = (int) (tvx.Length() / 8);
    }

    this.fieldInfos = fieldInfos;
}
/// <summary> Reads one int from the input as a format version and validates it.</summary>
/// <param name="in_Renamed">Input to read the version from; one int is consumed
/// </param>
/// <returns> The format version that was read
/// </returns>
/// <throws> IOException if the version is greater than TermVectorsWriter.FORMAT_VERSION </throws>
private int CheckValidFormat(IndexInput in_Renamed)
{
    int format = in_Renamed.ReadInt();
    if (format > TermVectorsWriter.FORMAT_VERSION)
    {
        throw new System.IO.IOException("Incompatible format version: " + format + " expected " + TermVectorsWriter.FORMAT_VERSION + " or less");
    }
    // The constructor stores this value as tvdFormat / tvfFormat.
    return format;
}
// Closes this reader. Per the comment below, every close is attempted and
// only the first IOException is remembered in `keep`.
62 internal virtual void Close()
64 // make all effort to close up. Keep the first exception
65 // and throw it as a new one.
66 System
.IO
.IOException keep
= null;
// NOTE(review): the statements guarded by the three catch clauses below are
// not present in this listing. Presumably each one closes one of tvx, tvd and
// tvf and stores the first caught exception in `keep` - confirm against the
// original file.
72 catch (System
.IO
.IOException e
)
82 catch (System
.IO
.IOException e
)
92 catch (System
.IO
.IOException e
)
// NOTE(review): the new exception is built from keep.StackTrace only, so
// keep's Message is not carried over and keep is not attached as an inner
// exception. The guard that skips this throw when keep is null is not visible
// in this listing.
99 throw new System
.IO
.IOException(keep
.StackTrace
);
/// <summary> Returns the value computed in the constructor as the tvx length divided by 8.</summary>
/// <returns> The number of documents in the reader
/// </returns>
internal virtual int Size()
{
    // NOTE(review): the body was not visible in the listing; restored from
    // the return type and the field assigned in the constructor.
    return size;
}
111 /// <summary> Retrieve the term vector for the given document and field</summary>
112 /// <param name="docNum">The document number to retrieve the vector for </param>
114 /// <param name="field">The field within the document to retrieve </param>
116 /// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field. </returns>
118 /// <throws> IOException if there is an error reading the term vector files </throws>
119 public /*internal*/ virtual TermFreqVector
Get(int docNum
, System
.String field
)
121 // Check if no term vectors are available for this segment at all
122 int fieldNumber
= fieldInfos
.FieldNumber(field
);
123 TermFreqVector result
= null;
// NOTE(review): the guard that the comment above refers to is not present in
// this listing (presumably a null check on tvx) - confirm.
126 //We need to account for the FORMAT_SIZE when seeking in the tvx
127 //We don't need to do this in other seeks because we already have the
129 //that was written in another file
// Each document owns one 8-byte slot in tvx after the format header; the long
// stored there is read into `position`.
130 tvx
.Seek((docNum
* 8L) + TermVectorsWriter
.FORMAT_SIZE
);
131 //System.out.println("TVX Pointer: " + tvx.getFilePointer());
132 long position
= tvx
.ReadLong();
// NOTE(review): no seek of tvd is visible between the read above and the read
// below; presumably tvd is positioned at `position` first - confirm.
135 int fieldCount
= tvd
.ReadVInt();
136 //System.out.println("Num Fields: " + fieldCount);
137 // There are only a few fields per document. We opt for a full scan
138 // rather than requiring that they be ordered. We need to read through
139 // all of the fields anyway to get to the tvf pointers.
// NOTE(review): the declarations of `number` and `found` are not visible in
// this listing.
142 for (int i
= 0; i
< fieldCount
; i
++)
// With the current tvd format the field number is read as an absolute value;
// the second statement accumulates it as a delta (the keyword separating the
// two branches is not visible here, presumably `else`).
144 if (tvdFormat
== TermVectorsWriter
.FORMAT_VERSION
)
145 number
= tvd
.ReadVInt();
147 number
+= tvd
.ReadVInt();
// NOTE(review): the statement controlled by this comparison is not visible;
// presumably it records the loop index in `found`.
149 if (number
== fieldNumber
)
153 // This field, although valid in the segment, was not found in this
157 // Compute position in the tvf file
// Adds the first found + 1 VLong values from tvd onto `position`.
// NOTE(review): whether `position` is reset before this loop is not visible.
159 for (int i
= 0; i
<= found
; i
++)
160 position
+= tvd
.ReadVLong();
162 result
= ReadTermVector(field
, position
);
166 //System.out.println("Field not found");
171 //System.out.println("No tvx file");
176 /// <summary> Return all term vectors stored for this document or null if they could not be read in. </summary>
179 /// <param name="docNum">The document number to retrieve the vector for </param>
181 /// <returns> All term frequency vectors </returns>
183 /// <throws> IOException if there is an error reading the term vector files </throws>
184 internal virtual TermFreqVector
[] Get(int docNum
)
186 TermFreqVector
[] result
= null;
187 // Check if no term vectors are available for this segment at all
// NOTE(review): the guard this comment refers to is not present in this
// listing (presumably a null check on tvx) - confirm.
190 //We need to offset by the FORMAT_SIZE header when seeking in the tvx
191 tvx
.Seek((docNum
* 8L) + TermVectorsWriter
.FORMAT_SIZE
);
192 long position
= tvx
.ReadLong();
// NOTE(review): no seek of tvd is visible before the read below; presumably
// tvd is positioned at `position` first - confirm.
195 int fieldCount
= tvd
.ReadVInt();
197 // No fields are vectorized for this document
// NOTE(review): the check on fieldCount that the comment above belongs to and
// the declaration of `number` are not visible in this listing.
201 System
.String
[] fields
= new System
.String
[fieldCount
];
// First pass over tvd: resolve each stored field number to its name.
203 for (int i
= 0; i
< fieldCount
; i
++)
// With the current tvd format the number is absolute; the second statement
// accumulates it as a delta (branch keyword not visible, presumably `else`).
205 if (tvdFormat
== TermVectorsWriter
.FORMAT_VERSION
)
206 number
= tvd
.ReadVInt();
208 number
+= tvd
.ReadVInt();
210 fields
[i
] = fieldInfos
.FieldName(number
);
213 // Compute position in the tvf file
// Second pass: each VLong is added to the running `position`, and the running
// value is stored as that field's tvf pointer.
// NOTE(review): whether `position` is reset before this loop is not visible.
215 long[] tvfPointers
= new long[fieldCount
];
216 for (int i
= 0; i
< fieldCount
; i
++)
218 position
+= tvd
.ReadVLong();
219 tvfPointers
[i
] = position
;
222 result
= ReadTermVectors(fields
, tvfPointers
);
227 //System.out.println("No tvx file");
/// <summary> Reads one term vector per field by calling ReadTermVector for each entry.</summary>
/// <param name="fields">Field names, parallel to tvfPointers
/// </param>
/// <param name="tvfPointers">For each field, the position in the tvf file to read from
/// </param>
/// <returns> The vectors, in the same order as fields
/// </returns>
private SegmentTermVector[] ReadTermVectors(System.String[] fields, long[] tvfPointers)
{
    SegmentTermVector[] res = new SegmentTermVector[fields.Length];
    for (int i = 0; i < fields.Length; i++)
    {
        res[i] = ReadTermVector(fields[i], tvfPointers[i]);
    }
    // NOTE(review): the return statement was not visible in the listing; it is
    // restored from the declared return type.
    return res;
}
243 /// <summary> Reads a single field's term vector from the tvf file. </summary>
244 /// <param name="field">The field to read in </param>
246 /// <param name="tvfPointer">The pointer within the tvf file where we should start reading </param>
248 /// <returns> The TermVector located at that position </returns>
250 /// <throws> IOException </throws>
251 private SegmentTermVector
ReadTermVector(System
.String field
, long tvfPointer
)
254 // Now read the data from specified position
255 //We don't need to offset by the FORMAT here since the pointer already includes the offset
256 tvf
.Seek(tvfPointer
);
258 int numTerms
= tvf
.ReadVInt();
259 //System.out.println("Num Terms: " + numTerms);
260 // If no terms - return a constant empty termvector. However, this should never occur!
// NOTE(review): the condition guarding this return is not visible in this
// listing; presumably it tests numTerms for zero.
262 return new SegmentTermVector(field
, null, null);
// With the current tvf format one flag byte follows the term count; its bits
// say whether positions and/or offsets were stored for this field.
// NOTE(review): the declarations of storePositions / storeOffsets are not
// visible in this listing.
267 if (tvfFormat
== TermVectorsWriter
.FORMAT_VERSION
)
269 byte bits
= tvf
.ReadByte();
270 storePositions
= (bits
& TermVectorsWriter
.STORE_POSITIONS_WITH_TERMVECTOR
) != 0;
271 storeOffsets
= (bits
& TermVectorsWriter
.STORE_OFFSET_WITH_TERMVECTOR
) != 0;
// Presumably the other branch of the format check (keyword not visible):
// neither positions nor offsets are read.
276 storePositions
= false;
277 storeOffsets
= false;
280 System
.String
[] terms
= new System
.String
[numTerms
];
281 int[] termFreqs
= new int[numTerms
];
283 // we may not need these, but declare them
284 int[][] positions
= null;
285 TermVectorOffsetInfo
[][] offsets
= null;
// NOTE(review): the conditions around the two allocations below are not
// visible; presumably they depend on storePositions and storeOffsets.
287 positions
= new int[numTerms
][];
289 offsets
= new TermVectorOffsetInfo
[numTerms
][];
// `buffer` holds the characters of the previous term so that a term can reuse
// its leading characters.
// NOTE(review): the declarations of start / deltaLength / totalLength are not
// visible in this listing.
294 char[] buffer
= new char[]{};
295 System
.String previousString
= "";
297 for (int i
= 0; i
< numTerms
; i
++)
// `start` is where the newly read characters begin in `buffer`; the
// characters before it are kept from the previous term. `deltaLength` is the
// number of characters read for this term.
299 start
= tvf
.ReadVInt();
300 deltaLength
= tvf
.ReadVInt();
301 totalLength
= start
+ deltaLength
;
// Replace the buffer when it is too small, copying the previous term in so
// its leading characters are still available.
302 if (buffer
.Length
< totalLength
)
304 buffer
= new char[totalLength
];
305 for (int j
= 0; j
< previousString
.Length
; j
++)
307 buffer
[j
] = previousString
[j
];
309 tvf
.ReadChars(buffer
, start
, deltaLength
);
310 terms
[i
] = new System
.String(buffer
, 0, totalLength
);
311 previousString
= terms
[i
];
// NOTE(review): the store of `freq` into termFreqs is not visible in this
// listing.
312 int freq
= tvf
.ReadVInt();
317 //read in the positions
// Each value read is added to the previous position to get the next one.
// NOTE(review): the guard on storePositions and the store of `pos` into
// `positions` are not visible.
318 int[] pos
= new int[freq
];
320 int prevPosition
= 0;
321 for (int j
= 0; j
< freq
; j
++)
323 pos
[j
] = prevPosition
+ tvf
.ReadVInt();
324 prevPosition
= pos
[j
];
// The start offset is the previous end offset plus the value read; the end
// offset is the start offset plus the next value read.
// NOTE(review): the guard on storeOffsets, the declaration of prevOffset and
// the store of `offs` into `offsets` are not visible.
330 TermVectorOffsetInfo
[] offs
= new TermVectorOffsetInfo
[freq
];
333 for (int j
= 0; j
< freq
; j
++)
335 int startOffset
= prevOffset
+ tvf
.ReadVInt();
336 int endOffset
= startOffset
+ tvf
.ReadVInt();
337 offs
[j
] = new TermVectorOffsetInfo(startOffset
, endOffset
);
338 prevOffset
= endOffset
;
// A SegmentTermPositionVector is built when positions or offsets were stored;
// the plain SegmentTermVector below is presumably the other branch (keyword
// not visible). The final return is not visible in this listing.
343 SegmentTermVector tv
;
344 if (storePositions
|| storeOffsets
)
346 tv
= new SegmentTermPositionVector(field
, terms
, termFreqs
, positions
, offsets
);
350 tv
= new SegmentTermVector(field
, terms
, termFreqs
);
// Creates a copy of this reader in which tvx, tvd and tvf are replaced by
// clones of this reader's inputs.
355 public virtual System
.Object
Clone()
// NOTE(review): the statement controlled by this check is not visible in this
// listing; presumably it returns early when any input is missing.
358 if (tvx
== null || tvd
== null || tvf
== null)
361 TermVectorsReader clone
= null;
// Copy the fields with MemberwiseClone first; the inputs are replaced below.
364 clone
= (TermVectorsReader
) base.MemberwiseClone();
// NOTE(review): the catch body is not visible. If it swallows the exception,
// `clone` is still null when it is dereferenced below - confirm.
366 catch (System
.Exception e
)
// Give the copy its own clones of the inputs instead of the references that
// MemberwiseClone copied.
370 clone
.tvx
= (IndexInput
) tvx
.Clone();
371 clone
.tvd
= (IndexInput
) tvd
.Clone();
372 clone
.tvf
= (IndexInput
) tvf
.Clone();
// NOTE(review): the final return of `clone` is not visible in this listing.