Update the thread-local storage patch, to fix #335178
[beagle.git] / beagled / Lucene.Net / Index / TermVectorsReader.cs
blob0c7510687511c14041b54c8f10e94b3e78b3ebab
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 using System;
17 using Directory = Lucene.Net.Store.Directory;
18 using IndexInput = Lucene.Net.Store.IndexInput;
namespace Lucene.Net.Index
{
	
	/// <summary> Reads the term vectors of one segment from the three files named
	/// <c>segment + TermVectorsWriter.TVX_EXTENSION</c> (per-document index),
	/// <c>segment + TermVectorsWriter.TVD_EXTENSION</c> (per-document field list) and
	/// <c>segment + TermVectorsWriter.TVF_EXTENSION</c> (per-field term data).
	/// When the segment has no TVX file the reader holds no inputs: both <c>Get</c>
	/// overloads and <c>Clone</c> then return null.
	/// </summary>
	/// <version> $Id: TermVectorsReader.cs,v 1.2 2005/10/06 19:29:56 dsd Exp $
	/// </version>
	class TermVectorsReader : System.ICloneable
	{
		private FieldInfos fieldInfos;
		
		// After a header of TermVectorsWriter.FORMAT_SIZE bytes, holds one 8-byte
		// long per document: the position of that document's entry in tvd.
		private IndexInput tvx;
		// Per document: a VInt field count, one VInt per field number, then one
		// VLong per field from which the tvf pointers are accumulated.
		private IndexInput tvd;
		// Per field: the term data decoded by ReadTermVector.
		private IndexInput tvf;
		// Length of tvx divided by 8; stays 0 when the segment has no TVX file.
		private int size;
		
		// Format numbers read from the head of the tvd and tvf files.
		private int tvdFormat;
		private int tvfFormat;
		
		/// <summary> Opens the term vector files of a segment if its TVX file exists and
		/// validates the format number at the head of each file. If opening or
		/// validating fails part-way, the inputs that were already opened are
		/// closed before the exception propagates.
		/// </summary>
		/// <param name="d">The directory holding the segment files
		/// </param>
		/// <param name="segment">The segment name the file extensions are appended to
		/// </param>
		/// <param name="fieldInfos">Used to map between field names and field numbers
		/// </param>
		/// <exception cref="System.IO.IOException">if a file reports a format number greater than
		/// <c>TermVectorsWriter.FORMAT_VERSION</c></exception>
		public /*internal*/ TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos)
		{
			if (d.FileExists(segment + TermVectorsWriter.TVX_EXTENSION))
			{
				bool success = false;
				try
				{
					tvx = d.OpenInput(segment + TermVectorsWriter.TVX_EXTENSION);
					CheckValidFormat(tvx);
					tvd = d.OpenInput(segment + TermVectorsWriter.TVD_EXTENSION);
					tvdFormat = CheckValidFormat(tvd);
					tvf = d.OpenInput(segment + TermVectorsWriter.TVF_EXTENSION);
					tvfFormat = CheckValidFormat(tvf);
					// Divide before narrowing: casting the long length to int first
					// would overflow for a large index file.
					size = (int) (tvx.Length() / 8);
					success = true;
				}
				finally
				{
					if (!success)
					{
						// Do not leak the inputs opened so far. IOExceptions raised
						// while closing are dropped so the original failure is the
						// one the caller sees.
						CloseInput(tvx, null);
						CloseInput(tvd, null);
						CloseInput(tvf, null);
					}
				}
			}
			
			this.fieldInfos = fieldInfos;
		}
		
		/// <summary> Reads the format number (an int) at the current position of the given input
		/// and rejects numbers greater than <c>TermVectorsWriter.FORMAT_VERSION</c>.
		/// </summary>
		/// <param name="in_Renamed">The input to read the format number from
		/// </param>
		/// <returns> The format number that was read
		/// </returns>
		/// <exception cref="System.IO.IOException">if the format number is greater than
		/// <c>TermVectorsWriter.FORMAT_VERSION</c></exception>
		private int CheckValidFormat(IndexInput in_Renamed)
		{
			int format = in_Renamed.ReadInt();
			if (format > TermVectorsWriter.FORMAT_VERSION)
			{
				throw new System.IO.IOException("Incompatible format version: " + format + " expected " + TermVectorsWriter.FORMAT_VERSION + " or less");
			}
			return format;
		}
		
		/// <summary> Closes every input that was opened. All three are attempted even if an
		/// earlier one fails; the first failure is then reported.
		/// </summary>
		/// <exception cref="System.IO.IOException">carrying the message of the first IOException
		/// raised while closing, with that exception as its InnerException</exception>
		internal virtual void Close()
		{
			System.IO.IOException keep = null;
			keep = CloseInput(tvx, keep);
			keep = CloseInput(tvd, keep);
			keep = CloseInput(tvf, keep);
			if (keep != null)
			{
				// Keep the real message and chain the cause rather than using the
				// stack trace text as the message.
				throw new System.IO.IOException(keep.Message, keep);
			}
		}
		
		/// <summary> Closes one input if it is not null, remembering the first IOException.</summary>
		/// <param name="input">The input to close; may be null
		/// </param>
		/// <param name="firstError">The first IOException seen so far, or null
		/// </param>
		/// <returns> <c>firstError</c> if it was not null, otherwise the IOException raised by
		/// this close, otherwise null
		/// </returns>
		private static System.IO.IOException CloseInput(IndexInput input, System.IO.IOException firstError)
		{
			if (input != null)
			{
				try
				{
					input.Close();
				}
				catch (System.IO.IOException e)
				{
					if (firstError == null)
					{
						firstError = e;
					}
				}
			}
			return firstError;
		}
		
		/// <summary> </summary>
		/// <returns> The number of documents in the reader
		/// </returns>
		internal virtual int Size()
		{
			return size;
		}
		
		/// <summary> Retrieve the term vector for the given document and field</summary>
		/// <param name="docNum">The document number to retrieve the vector for
		/// </param>
		/// <param name="field">The field within the document to retrieve
		/// </param>
		/// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field.
		/// </returns>
		/// <exception cref="System.IO.IOException">if there is an error reading the term vector files</exception>
		public /*internal*/ virtual TermFreqVector Get(int docNum, System.String field)
		{
			int fieldNumber = fieldInfos.FieldNumber(field);
			
			// No term vectors are available for this segment at all.
			if (tvx == null)
			{
				return null;
			}
			
			int fieldCount = SeekToDocument(docNum);
			
			// There are only a few fields per document, so a full scan is used rather
			// than requiring that they be ordered. Every field number has to be read
			// anyway, because the tvf pointers follow the last one.
			int number = 0;
			int found = - 1;
			for (int i = 0; i < fieldCount; i++)
			{
				number = ReadFieldNumber(number);
				if (number == fieldNumber)
				{
					found = i;
				}
			}
			
			// The field, although valid in the segment, has no vector in this document.
			if (found == - 1)
			{
				return null;
			}
			
			// The tvf position of the field is the sum of the pointer deltas up to
			// and including its own entry.
			long position = 0;
			for (int i = 0; i <= found; i++)
			{
				position += tvd.ReadVLong();
			}
			
			return ReadTermVector(field, position);
		}
		
		/// <summary> Return all term vectors stored for this document, or null if the segment has
		/// no term vector files or no fields are vectorized for the document.
		/// </summary>
		/// <param name="docNum">The document number to retrieve the vector for
		/// </param>
		/// <returns> All term frequency vectors
		/// </returns>
		/// <exception cref="System.IO.IOException">if there is an error reading the term vector files</exception>
		internal virtual TermFreqVector[] Get(int docNum)
		{
			// No term vectors are available for this segment at all.
			if (tvx == null)
			{
				return null;
			}
			
			int fieldCount = SeekToDocument(docNum);
			
			// No fields are vectorized for this document.
			if (fieldCount == 0)
			{
				return null;
			}
			
			int number = 0;
			System.String[] fields = new System.String[fieldCount];
			for (int i = 0; i < fieldCount; i++)
			{
				number = ReadFieldNumber(number);
				fields[i] = fieldInfos.FieldName(number);
			}
			
			// Each tvf pointer is the running sum of the stored deltas.
			long position = 0;
			long[] tvfPointers = new long[fieldCount];
			for (int i = 0; i < fieldCount; i++)
			{
				position += tvd.ReadVLong();
				tvfPointers[i] = position;
			}
			
			return ReadTermVectors(fields, tvfPointers);
		}
		
		/// <summary> Positions tvd at the entry of the given document and reads its field count.
		/// Must only be called when the inputs are open.
		/// </summary>
		/// <param name="docNum">The document number to seek to
		/// </param>
		/// <returns> The field count read from the start of the document's tvd entry
		/// </returns>
		private int SeekToDocument(int docNum)
		{
			// tvx starts with a header of FORMAT_SIZE bytes, followed by one 8-byte
			// pointer per document. The other files need no such offset because the
			// pointers stored for them are already absolute.
			tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
			long position = tvx.ReadLong();
			
			tvd.Seek(position);
			return tvd.ReadVInt();
		}
		
		/// <summary> Reads the next field number from tvd. When tvdFormat equals
		/// <c>TermVectorsWriter.FORMAT_VERSION</c> the stored value is the number itself;
		/// otherwise it is added to the previous number.
		/// </summary>
		/// <param name="previous">The field number read before this one (0 for the first)
		/// </param>
		/// <returns> The decoded field number
		/// </returns>
		private int ReadFieldNumber(int previous)
		{
			int stored = tvd.ReadVInt();
			if (tvdFormat == TermVectorsWriter.FORMAT_VERSION)
			{
				return stored;
			}
			return previous + stored;
		}
		
		/// <summary> Reads one term vector per field, in the order given.</summary>
		/// <param name="fields">The field names
		/// </param>
		/// <param name="tvfPointers">For each field, the position in the tvf file to read from
		/// </param>
		/// <returns> One term vector per entry of <c>fields</c>
		/// </returns>
		private SegmentTermVector[] ReadTermVectors(System.String[] fields, long[] tvfPointers)
		{
			SegmentTermVector[] res = new SegmentTermVector[fields.Length];
			for (int i = 0; i < fields.Length; i++)
			{
				res[i] = ReadTermVector(fields[i], tvfPointers[i]);
			}
			return res;
		}
		
		/// <summary> Decodes the term vector stored at the given tvf position.</summary>
		/// <param name="field">The field to read in
		/// </param>
		/// <param name="tvfPointer">The pointer within the tvf file where we should start reading
		/// </param>
		/// <returns> The TermVector located at that position: a SegmentTermPositionVector when
		/// positions or offsets are stored, a plain SegmentTermVector otherwise
		/// </returns>
		/// <exception cref="System.IO.IOException">if there is an error reading the tvf file</exception>
		private SegmentTermVector ReadTermVector(System.String field, long tvfPointer)
		{
			// The pointer already includes the format header, so no offset is added.
			tvf.Seek(tvfPointer);
			
			int numTerms = tvf.ReadVInt();
			// Not expected to occur, but answered with a vector that has no terms.
			if (numTerms == 0)
			{
				return new SegmentTermVector(field, null, null);
			}
			
			bool storePositions;
			bool storeOffsets;
			if (tvfFormat == TermVectorsWriter.FORMAT_VERSION)
			{
				byte bits = tvf.ReadByte();
				storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
				storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
			}
			else
			{
				// Other formats: one VInt is read and discarded here, and neither
				// positions nor offsets are read.
				tvf.ReadVInt();
				storePositions = false;
				storeOffsets = false;
			}
			
			System.String[] terms = new System.String[numTerms];
			int[] termFreqs = new int[numTerms];
			
			// Only allocated when the corresponding data is stored.
			int[][] positions = storePositions ? new int[numTerms][] : null;
			TermVectorOffsetInfo[][] offsets = storeOffsets ? new TermVectorOffsetInfo[numTerms][] : null;
			
			// Each term is stored as the number of leading chars it shares with the
			// previous term (start) plus the chars that follow (deltaLength), so the
			// buffer keeps the previous term's text between iterations.
			char[] buffer = new char[0];
			System.String previousString = "";
			
			for (int i = 0; i < numTerms; i++)
			{
				int start = tvf.ReadVInt();
				int deltaLength = tvf.ReadVInt();
				int totalLength = start + deltaLength;
				if (buffer.Length < totalLength)
				{
					// Grow the buffer and carry over the previous term so that its
					// shared prefix is still available.
					buffer = new char[totalLength];
					previousString.CopyTo(0, buffer, 0, previousString.Length);
				}
				tvf.ReadChars(buffer, start, deltaLength);
				terms[i] = new System.String(buffer, 0, totalLength);
				previousString = terms[i];
				
				int freq = tvf.ReadVInt();
				termFreqs[i] = freq;
				
				// Positions come before offsets in the file.
				if (storePositions)
				{
					positions[i] = ReadPositions(freq);
				}
				if (storeOffsets)
				{
					offsets[i] = ReadOffsets(freq);
				}
			}
			
			if (storePositions || storeOffsets)
			{
				return new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
			}
			return new SegmentTermVector(field, terms, termFreqs);
		}
		
		/// <summary> Reads <c>freq</c> positions from tvf; each is stored as the difference
		/// from the previous position.
		/// </summary>
		/// <param name="freq">The number of positions to read
		/// </param>
		/// <returns> The decoded positions
		/// </returns>
		private int[] ReadPositions(int freq)
		{
			int[] pos = new int[freq];
			int prevPosition = 0;
			for (int j = 0; j < freq; j++)
			{
				pos[j] = prevPosition + tvf.ReadVInt();
				prevPosition = pos[j];
			}
			return pos;
		}
		
		/// <summary> Reads <c>freq</c> offset pairs from tvf. A start offset is stored as the
		/// difference from the previous end offset, an end offset as the difference
		/// from its own start offset.
		/// </summary>
		/// <param name="freq">The number of offset pairs to read
		/// </param>
		/// <returns> The decoded offsets
		/// </returns>
		private TermVectorOffsetInfo[] ReadOffsets(int freq)
		{
			TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
			int prevOffset = 0;
			for (int j = 0; j < freq; j++)
			{
				int startOffset = prevOffset + tvf.ReadVInt();
				int endOffset = startOffset + tvf.ReadVInt();
				offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
				prevOffset = endOffset;
			}
			return offs;
		}
		
		/// <summary> Creates a copy of this reader that has its own clones of the three inputs
		/// and copies every other field as-is.
		/// </summary>
		/// <returns> The copy, or null if any of the three inputs is not open
		/// </returns>
		public virtual System.Object Clone()
		{
			if (tvx == null || tvd == null || tvf == null)
			{
				return null;
			}
			
			// No try/catch here: swallowing an exception would only turn it into a
			// null dereference on the assignments below.
			TermVectorsReader clone = (TermVectorsReader) base.MemberwiseClone();
			clone.tvx = (IndexInput) tvx.Clone();
			clone.tvd = (IndexInput) tvd.Clone();
			clone.tvf = (IndexInput) tvf.Clone();
			
			return clone;
		}
	}
}