2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 using Analyzer
= Lucene
.Net
.Analysis
.Analyzer
;
18 using Token
= Lucene
.Net
.Analysis
.Token
;
19 using TokenStream
= Lucene
.Net
.Analysis
.TokenStream
;
20 using Document
= Lucene
.Net
.Documents
.Document
;
21 using Field
= Lucene
.Net
.Documents
.Field
;
22 using Similarity
= Lucene
.Net
.Search
.Similarity
;
23 using Directory
= Lucene
.Net
.Store
.Directory
;
24 using IndexOutput
= Lucene
.Net
.Store
.IndexOutput
;
25 namespace Lucene
.Net
.Index
28 sealed public class DocumentWriter
/// <summary>
/// Shared constructor initialization: seeds the term index interval
/// with IndexWriter's default. Called from the constructors.
/// </summary>
private void InitBlock()
{
    termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
}
// Analyzer used to tokenize the document's indexed, tokenized fields.
34 private Analyzer analyzer
;
// Directory the segment files (.fnm, .frq, .prx, .f<n>, ...) are written to.
35 private Directory directory
;
// Similarity used to compute field norms in WriteNorms.
36 private Similarity similarity
;
// Per-segment field metadata; rebuilt for each AddDocument call.
37 private FieldInfos fieldInfos
;
// Maximum number of tokens indexed per field; excess tokens are dropped.
38 private int maxFieldLength
;
// How often a term is added to the term index (see InitBlock for default).
39 private int termIndexInterval
;
// If non-null, receives a message when maxFieldLength is reached.
40 private System
.IO
.TextWriter infoStream
;
42 /// <summary>This ctor used by test code only.
45 /// <param name="directory">The directory to write the document information to
47 /// <param name="analyzer">The analyzer to use for the document
49 /// <param name="similarity">The Similarity function
51 /// <param name="maxFieldLength">The maximum number of tokens a field may have
// NOTE(review): embedded source-line numbers jump 53 -> 56, so interior lines
// are missing here - most likely the opening brace and an InitBlock() call
// (which seeds termIndexInterval). Confirm against the full file.
53 internal DocumentWriter(Directory directory
, Analyzer analyzer
, Similarity similarity
, int maxFieldLength
)
// Store the collaborators; no work is done until AddDocument is called.
56 this.directory
= directory
;
57 this.analyzer
= analyzer
;
58 this.similarity
= similarity
;
59 this.maxFieldLength
= maxFieldLength
;
// Constructor used by IndexWriter: pulls similarity, max field length and
// term index interval from the owning writer's current settings.
// NOTE(review): embedded line numbers jump 62 -> 65; the opening brace and a
// probable InitBlock() call are missing from this view - confirm upstream.
62 internal DocumentWriter(Directory directory
, Analyzer analyzer
, IndexWriter writer
)
65 this.directory
= directory
;
66 this.analyzer
= analyzer
;
// Snapshot the writer's settings at construction time; later changes to the
// IndexWriter are not reflected here.
67 this.similarity
= writer
.GetSimilarity();
68 this.maxFieldLength
= writer
.GetMaxFieldLength();
69 this.termIndexInterval
= writer
.GetTermIndexInterval();
// Writes a single document as segment `segment`: field names (.fnm), stored
// fields (.fdt/.fdx via FieldsWriter), then the inverted postings and norms.
// NOTE(review): embedded line numbers have gaps (72..75, 83..90, 117..122),
// so statements are missing from this view - likely fieldInfos.Add(doc),
// fieldsWriter.Close(), InvertDocument(doc) and WriteNorms(segment). Confirm
// against the full file before relying on this listing.
72 /*internal*/ public void AddDocument(System
.String segment
, Document doc
)
// Fresh per-document field metadata, then persist the field-name table.
75 fieldInfos
= new FieldInfos();
77 fieldInfos
.Write(directory
, segment
+ ".fnm");
// Write the document's stored fields.
80 FieldsWriter fieldsWriter
= new FieldsWriter(directory
, segment
, fieldInfos
);
83 fieldsWriter
.AddDocument(doc
);
90 // invert doc into postingTable
91 postingTable
.Clear(); // clear postingTable
92 fieldLengths
= new int[fieldInfos
.Size()]; // init fieldLengths
93 fieldPositions
= new int[fieldInfos
.Size()]; // init fieldPositions
94 fieldOffsets
= new int[fieldInfos
.Size()]; // init fieldOffsets
96 fieldBoosts
= new float[fieldInfos
.Size()]; // init fieldBoosts
// Every field starts with the document-level boost; InvertDocument then
// multiplies in each field's own boost.
97 float boost
= doc
.GetBoost();
98 for (int i
= 0; i
< fieldBoosts
.Length
; i
++)
100 fieldBoosts
[i
] = boost
;
105 // sort postingTable into an array
106 Posting
[] postings
= SortPostingTable();
// NOTE(review): the following lines are Java (`postings.length`,
// `System.out.print`) - debug output that was presumably inside a comment
// block in the original port; the comment delimiters are missing here.
109 for (int i = 0; i < postings.length; i++) {
110 Posting posting = postings[i];
111 System.out.print(posting.term);
112 System.out.print(" freq=" + posting.freq);
113 System.out.print(" pos=");
114 System.out.print(posting.positions[0]);
115 for (int j = 1; j < posting.freq; j++)
116 System.out.print("," + posting.positions[j]);
117 System.out.println("");
// Emit .frq/.prx/.tis (and term vectors if any field requests them).
122 WritePostings(postings
, segment
);
124 // write norms of indexed fields
128 // Keys are Terms, values are Postings.
129 // Used to buffer a document before it is written to the index.
// Synchronized wrapper makes individual Hashtable operations thread-safe.
130 private System
.Collections
.Hashtable postingTable
= System
.Collections
.Hashtable
.Synchronized(new System
.Collections
.Hashtable());
// Per-field counters, indexed by field number (see fieldInfos.FieldNumber):
// token count, next token position, character offset, and accumulated boost.
131 private int[] fieldLengths
;
132 private int[] fieldPositions
;
133 private int[] fieldOffsets
;
134 private float[] fieldBoosts
;
136 // Tokenizes the fields of a document into Postings.
// For each indexed field: either adds the whole string as one position
// (un-tokenized fields) or runs the analyzer and adds one position per token,
// optionally recording character offsets for term vectors. Updates the
// per-field length/position/offset/boost arrays when done.
// NOTE(review): embedded line numbers have gaps (158->163, 169->171,
// 183->186, 189->194) - else-branches, closing braces and a lastToken update
// inside the token loop appear to be missing from this view.
137 private void InvertDocument(Document doc
)
139 foreach(Field field
in doc
.Fields())
141 System
.String fieldName
= field
.Name();
142 int fieldNumber
= fieldInfos
.FieldNumber(fieldName
);
// Resume counters from any previous field instance with the same name.
144 int length
= fieldLengths
[fieldNumber
]; // length of field
145 int position
= fieldPositions
[fieldNumber
]; // position in field
146 int offset
= fieldOffsets
[fieldNumber
]; // offset field
148 if (field
.IsIndexed())
150 if (!field
.IsTokenized())
152 // un-tokenized field
// The entire string value is one term occupying one position.
153 System
.String stringValue
= field
.StringValue();
154 if (field
.IsStoreOffsetWithTermVector())
155 AddPosition(fieldName
, stringValue
, position
++, new TermVectorOffsetInfo(offset
, offset
+ stringValue
.Length
));
157 AddPosition(fieldName
, stringValue
, position
++, null);
158 offset
+= stringValue
.Length
;
// Tokenized field: obtain a Reader over the content.
163 System
.IO
.TextReader reader
; // find or make Reader
164 if (field
.ReaderValue() != null)
165 reader
= field
.ReaderValue();
166 else if (field
.StringValue() != null)
167 reader
= new System
.IO
.StringReader(field
.StringValue());
// Binary-only / valueless fields cannot be tokenized.
169 throw new System
.ArgumentException("field must have either String or Reader value");
171 // Tokenize field and add to postingTable
172 TokenStream stream
= analyzer
.TokenStream(fieldName
, reader
);
175 Token lastToken
= null;
176 for (Token t
= stream
.Next(); t
!= null; t
= stream
.Next())
// Honor the token's position increment (e.g. stopword gaps, synonyms).
178 position
+= (t
.GetPositionIncrement() - 1);
180 if (field
.IsStoreOffsetWithTermVector())
181 AddPosition(fieldName
, t
.TermText(), position
++, new TermVectorOffsetInfo(offset
+ t
.StartOffset(), offset
+ t
.EndOffset()));
183 AddPosition(fieldName
, t
.TermText(), position
++, null);
// Stop indexing this field once the token cap is reached.
186 if (++length
> maxFieldLength
)
188 if (infoStream
!= null)
189 infoStream
.WriteLine("maxFieldLength " + maxFieldLength
+ " reached, ignoring following tokens");
// Advance the character offset past this field instance (+1 separator).
194 if (lastToken
!= null)
195 offset
+= lastToken
.EndOffset() + 1;
// Persist the updated counters for subsequent instances of this field.
203 fieldLengths
[fieldNumber
] = length
; // save field length
204 fieldPositions
[fieldNumber
] = position
; // save field position
205 fieldBoosts
[fieldNumber
] *= field
.GetBoost();
206 fieldOffsets
[fieldNumber
] = offset
;
// Reusable Term used as a lookup key in AddPosition, so each lookup does not
// allocate a new Term ("avoid consing").
211 private Term termBuffer
= new Term("", ""); // avoid consing
// Records one occurrence of `text` in `field` at `position` into
// postingTable, growing the Posting's positions/offsets arrays (doubling)
// when full, or creating a new Posting for a first-seen term.
// NOTE(review): embedded line numbers jump 217 -> 222; the `if (ti != null)`
// word-seen-before check and the `int freq = ti.freq;` declaration appear to
// be missing from this view, as do the null-offsets branches - `freq` is
// referenced below without a visible declaration. Confirm upstream.
213 private void AddPosition(System
.String field
, System
.String text
, int position
, TermVectorOffsetInfo offset
)
// Reuse the shared buffer as the lookup key instead of allocating a Term.
215 termBuffer
.Set(field
, text
);
216 //System.out.println("Offset: " + offset);
217 Posting ti
= (Posting
) postingTable
[termBuffer
];
222 if (ti
.positions
.Length
== freq
)
224 // positions array is full
225 int[] newPositions
= new int[freq
* 2]; // double size
226 int[] positions
= ti
.positions
;
227 for (int i
= 0; i
< freq
; i
++)
228 // copy old positions to new
229 newPositions
[i
] = positions
[i
];
230 ti
.positions
= newPositions
;
232 ti
.positions
[freq
] = position
; // add new position
// Same doubling strategy for the term-vector offsets array.
236 if (ti
.offsets
.Length
== freq
)
238 TermVectorOffsetInfo
[] newOffsets
= new TermVectorOffsetInfo
[freq
* 2];
239 TermVectorOffsetInfo
[] offsets
= ti
.offsets
;
240 for (int i
= 0; i
< freq
; i
++)
242 newOffsets
[i
] = offsets
[i
];
244 ti
.offsets
= newOffsets
;
246 ti
.offsets
[freq
] = offset
;
248 ti
.freq
= freq
+ 1; // update frequency
252 // word not seen before
// Intern a fresh Term (the `false` flag presumably skips string interning -
// TODO confirm against Term's constructor) and start a new Posting.
253 Term term
= new Term(field
, text
, false);
254 postingTable
[term
] = new Posting(term
, position
, offset
);
// Copies the buffered postings into an array and sorts it by term so the
// postings can be written in dictionary order.
// NOTE(review): embedded line numbers jump 265 -> 269 and stop at 269; the
// `return array;` statement is not visible in this view - confirm upstream.
258 private Posting
[] SortPostingTable()
260 // copy postingTable into an array
261 Posting
[] array
= new Posting
[postingTable
.Count
];
262 System
.Collections
.IEnumerator postings
= postingTable
.Values
.GetEnumerator();
263 for (int i
= 0; postings
.MoveNext(); i
++)
265 array
[i
] = (Posting
) postings
.Current
;
// In-place quicksort over the full array by Term order.
269 QuickSort(array
, 0, array
.Length
- 1);
// Recursive in-place quicksort of postings[lo..hi] by Term order, using
// median-of-three pivot selection (lo, mid, hi) before partitioning.
// NOTE(review): this listing is incomplete - the `if (lo >= hi) return;`
// base case, the `left`/`right` partition-index declarations and their
// increments, and the tails of the first two median-of-three swaps
// (`postings[mid] = tmp;` / `postings[hi] = tmp;`) are not visible here
// (embedded line numbers jump 284->288, 291->294, 298->308, 322->331).
274 private static void QuickSort(Posting
[] postings
, int lo
, int hi
)
279 int mid
= (lo
+ hi
) / 2;
// Median-of-three: order postings[lo], postings[mid], postings[hi].
281 if (postings
[lo
].term
.CompareTo(postings
[mid
].term
) > 0)
283 Posting tmp
= postings
[lo
];
284 postings
[lo
] = postings
[mid
];
288 if (postings
[mid
].term
.CompareTo(postings
[hi
].term
) > 0)
290 Posting tmp
= postings
[mid
];
291 postings
[mid
] = postings
[hi
];
294 if (postings
[lo
].term
.CompareTo(postings
[mid
].term
) > 0)
296 Posting tmp2
= postings
[lo
];
297 postings
[lo
] = postings
[mid
];
298 postings
[mid
] = tmp2
;
// Partition around the median term.
308 Term partition
= postings
[mid
].term
;
312 while (postings
[right
].term
.CompareTo(partition
) > 0)
315 while (left
< right
&& postings
[left
].term
.CompareTo(partition
) <= 0)
// Elements on the wrong side of the partition: swap them.
320 Posting tmp
= postings
[left
];
321 postings
[left
] = postings
[right
];
322 postings
[right
] = tmp
;
// Recurse into both halves.
331 QuickSort(postings
, lo
, left
);
332 QuickSort(postings
, left
+ 1, hi
);
// Writes the sorted postings to the segment's inverted-index files:
// .frq (doc/frequency data), .prx (delta-encoded positions), the term
// dictionary via TermInfosWriter, and term vectors for fields that store
// them. On exit, attempts to close every stream and rethrows the first
// IOException encountered while closing.
// NOTE(review): this listing is incomplete - the try/finally skeleton and
// the Close() calls for freq/prox/tis inside the catch/keep blocks are not
// visible (embedded line numbers jump 339->342, 405->409, 411->417, etc.);
// the visible `catch` clauses have no visible `try` bodies. Confirm against
// the full file.
335 private void WritePostings(Posting
[] postings
, System
.String segment
)
337 IndexOutput freq
= null, prox
= null;
338 TermInfosWriter tis
= null;
339 TermVectorsWriter termVectorWriter
= null;
342 //open files for inverse index storage
343 freq
= directory
.CreateOutput(segment
+ ".frq");
344 prox
= directory
.CreateOutput(segment
+ ".prx");
345 tis
= new TermInfosWriter(directory
, segment
, fieldInfos
, termIndexInterval
);
// Reusable TermInfo written once per term into the dictionary.
346 TermInfo ti
= new TermInfo();
347 System
.String currentField
= null;
349 for (int i
= 0; i
< postings
.Length
; i
++)
351 Posting posting
= postings
[i
];
353 // add an entry to the dictionary with pointers to prox and freq files
// Single-document segment: docFreq is always 1; -1 means no skip data.
354 ti
.Set(1, freq
.GetFilePointer(), prox
.GetFilePointer(), - 1);
355 tis
.Add(posting
.term
, ti
);
357 // add an entry to the freq file
358 int postingFreq
= posting
.freq
;
359 if (postingFreq
== 1)
362 // set low bit of doc num.
// NOTE(review): the freq==1 fast path (writing doc<<1|1) is not visible;
// only the general path below survives in this view.
365 freq
.WriteVInt(0); // the document number
366 freq
.WriteVInt(postingFreq
); // frequency in doc
369 int lastPosition
= 0; // write positions
370 int[] positions
= posting
.positions
;
371 for (int j
= 0; j
< postingFreq
; j
++)
373 // use delta-encoding
374 int position
= positions
[j
];
375 prox
.WriteVInt(position
- lastPosition
);
376 lastPosition
= position
;
378 // check to see if we switched to a new Field
// Reference comparison is intentional here: field name strings come from
// the same source, so identity suffices to detect a field change.
379 System
.String termField
= posting
.term
.Field();
380 if ((System
.Object
) currentField
!= (System
.Object
) termField
)
382 // changing Field - see if there is something to save
383 currentField
= termField
;
384 FieldInfo fi
= fieldInfos
.FieldInfo(currentField
);
385 if (fi
.storeTermVector
)
// Lazily create the term-vector writer on the first vectorized field.
387 if (termVectorWriter
== null)
389 termVectorWriter
= new TermVectorsWriter(directory
, segment
, fieldInfos
);
390 termVectorWriter
.OpenDocument();
392 termVectorWriter
.OpenField(currentField
);
394 else if (termVectorWriter
!= null)
396 termVectorWriter
.CloseField();
// Record this term in the open term-vector field, if any.
399 if (termVectorWriter
!= null && termVectorWriter
.IsFieldOpen())
401 termVectorWriter
.AddTerm(posting
.term
.Text(), postingFreq
, posting
.positions
, posting
.offsets
);
404 if (termVectorWriter
!= null)
405 termVectorWriter
.CloseDocument();
409 // make an effort to close all streams we can but remember and re-throw
410 // the first exception encountered in this process
411 System
.IO
.IOException keep
= null;
417 catch (System
.IO
.IOException e
)
427 catch (System
.IO
.IOException e
)
437 catch (System
.IO
.IOException e
)
442 if (termVectorWriter
!= null)
445 termVectorWriter
.Close();
447 catch (System
.IO
.IOException e
)
// Rethrow the first close failure. Wrapping keep.StackTrace in a new
// exception loses the original message/type - a known wart of this port.
454 throw new System
.IO
.IOException(keep
.StackTrace
); // throw new System.IO.IOException(keep.StackTrace);
// Writes one norm byte per indexed field to segment file ".f<n>": the
// field's accumulated boost times the Similarity length norm, encoded to a
// single byte via Similarity.EncodeNorm.
// NOTE(review): incomplete view - the `if (fi.isIndexed && !fi.omitNorms)`
// guard and the try/finally that closes `norms` are not visible here
// (embedded line numbers jump 463->466 and 467->470, then stop at 470).
459 private void WriteNorms(System
.String segment
)
461 for (int n
= 0; n
< fieldInfos
.Size(); n
++)
463 FieldInfo fi
= fieldInfos
.FieldInfo(n
);
466 float norm
= fieldBoosts
[n
] * similarity
.LengthNorm(fi
.name
, fieldLengths
[n
]);
467 IndexOutput norms
= directory
.CreateOutput(segment
+ ".f" + n
);
470 norms
.WriteByte(Similarity
.EncodeNorm(norm
));
/// <summary>If non-null, a message will be printed to this if maxFieldLength is reached.</summary>
/// <param name="infoStream">Destination for diagnostic messages; may be null to disable.</param>
internal void SetInfoStream(System.IO.TextWriter infoStream)
{
    this.infoStream = infoStream;
}
489 // info about a Term in a doc
490 internal Term term
; // the Term
491 internal int freq
; // its frequency in doc
// positions grows by doubling in DocumentWriter.AddPosition.
492 internal int[] positions
; // positions it occurs at
// Character offsets for term vectors; null when offsets are not stored.
493 internal TermVectorOffsetInfo
[] offsets
;
495 internal Posting(Term t
, int position
, TermVectorOffsetInfo offset
)
499 positions
= new int[1];
500 positions
[0] = position
;
503 offsets
= new TermVectorOffsetInfo
[1];