2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 using Analyzer
= Lucene
.Net
.Analysis
.Analyzer
;
19 using Token
= Lucene
.Net
.Analysis
.Token
;
20 using TokenStream
= Lucene
.Net
.Analysis
.TokenStream
;
21 using Document
= Lucene
.Net
.Documents
.Document
;
22 using Field
= Lucene
.Net
.Documents
.Field
;
23 using Similarity
= Lucene
.Net
.Search
.Similarity
;
24 using Directory
= Lucene
.Net
.Store
.Directory
;
25 using IndexOutput
= Lucene
.Net
.Store
.IndexOutput
;
27 namespace Lucene
.Net
.Index
30 public sealed class DocumentWriter
/// <summary>
/// Applies the field default that has no initializer of its own: the term index
/// interval starts out as IndexWriter.DEFAULT_TERM_INDEX_INTERVAL.
/// </summary>
private void InitBlock()
{
    this.termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
}
// Analyzer that supplies token streams and position-increment gaps in InvertDocument.
36 private Analyzer analyzer
;
// Directory in which the segment's output files are created.
37 private Directory directory
;
// Supplies LengthNorm() when WriteNorms computes a field's norm.
38 private Similarity similarity
;
// Field metadata for the document being written; replaced on every AddDocument call.
39 private FieldInfos fieldInfos
;
// Per-field token-count threshold checked in InvertDocument.
40 private int maxFieldLength
;
// Passed to TermInfosWriter in WritePostings; InitBlock sets it to
// IndexWriter.DEFAULT_TERM_INDEX_INTERVAL.
41 private int termIndexInterval
;
// Optional sink for the "maxFieldLength ... reached" message; null-checked before use.
42 private System
.IO
.TextWriter infoStream
;
44 /// <summary>This ctor is used by test code only.
47 /// <param name="directory">The directory to write the document information to
49 /// <param name="analyzer">The analyzer to use for the document
51 /// <param name="similarity">The Similarity function
53 /// <param name="maxFieldLength">The maximum number of tokens a field may have
55 public DocumentWriter(Directory directory
, Analyzer analyzer
, Similarity similarity
, int maxFieldLength
)
// Copy each argument into the field of the same name.
// NOTE(review): termIndexInterval is not assigned by any statement shown here;
// presumably InitBlock() supplies the default - confirm that it is invoked.
58 this.directory
= directory
;
59 this.analyzer
= analyzer
;
60 this.similarity
= similarity
;
61 this.maxFieldLength
= maxFieldLength
;
/// <summary>
/// Creates a writer that stores the given directory and analyzer and reads its
/// similarity, maximum field length and term index interval from an IndexWriter.
/// </summary>
/// <param name="directory">The directory to write the document information to</param>
/// <param name="analyzer">The analyzer to use for the document</param>
/// <param name="writer">Source of the similarity, max field length and term index interval</param>
64 public DocumentWriter(Directory directory
, Analyzer analyzer
, IndexWriter writer
)
67 this.directory
= directory
;
68 this.analyzer
= analyzer
;
// The remaining settings are taken from the IndexWriter's getters.
69 this.similarity
= writer
.GetSimilarity();
70 this.maxFieldLength
= writer
.GetMaxFieldLength();
71 this.termIndexInterval
= writer
.GetTermIndexInterval();
/// <summary>
/// Writes one document under the given segment name: field infos to
/// "segment.fnm", the document via a FieldsWriter, then the sorted postings.
/// </summary>
/// <param name="segment">Segment name used as the prefix of every file created</param>
/// <param name="doc">The document to write</param>
74 public /*internal*/ void AddDocument(System
.String segment
, Document doc
)
// Start from a fresh FieldInfos and write it to "<segment>.fnm".
// NOTE(review): no statement shown here populates fieldInfos from doc before it
// is written - confirm.
77 fieldInfos
= new FieldInfos();
79 fieldInfos
.Write(directory
, segment
+ ".fnm");
// Hand the document to a FieldsWriter created for this segment.
// NOTE(review): no Close() of fieldsWriter is visible here - confirm it is released.
82 FieldsWriter fieldsWriter
= new FieldsWriter(directory
, segment
, fieldInfos
);
85 fieldsWriter
.AddDocument(doc
);
92 // invert doc into postingTable
93 postingTable
.Clear(); // clear postingTable
// The per-field arrays are sized by fieldInfos.Size().
94 fieldLengths
= new int[fieldInfos
.Size()]; // init fieldLengths
95 fieldPositions
= new int[fieldInfos
.Size()]; // init fieldPositions
96 fieldOffsets
= new int[fieldInfos
.Size()]; // init fieldOffsets
98 fieldBoosts
= new float[fieldInfos
.Size()]; // init fieldBoosts
// Every field boost starts at the document's boost; InvertDocument multiplies
// in each field's own boost.
99 float boost
= doc
.GetBoost();
100 for (int i
= 0; i
< fieldBoosts
.Length
; i
++)
102 fieldBoosts
[i
] = boost
;
107 // sort postingTable into an array
108 Posting
[] postings
= SortPostingTable();
// NOTE(review): the next nine lines use Java APIs (System.out.print,
// postings.length) and read like a leftover debug dump of the postings;
// presumably they are meant to be commented out - confirm.
111 for (int i = 0; i < postings.length; i++) {
112 Posting posting = postings[i];
113 System.out.print(posting.term);
114 System.out.print(" freq=" + posting.freq);
115 System.out.print(" pos=");
116 System.out.print(posting.positions[0]);
117 for (int j = 1; j < posting.freq; j++)
118 System.out.print("," + posting.positions[j]);
119 System.out.println("");
// Write the sorted postings for this segment.
124 WritePostings(postings
, segment
);
126 // write norms of indexed fields
130 // Keys are Terms, values are Postings.
131 // Used to buffer a document before it is written to the index.
// Created as a Hashtable.Synchronized wrapper around a new Hashtable.
132 private System
.Collections
.Hashtable postingTable
= System
.Collections
.Hashtable
.Synchronized(new System
.Collections
.Hashtable());
// The four arrays below are indexed by field number and are reallocated by
// AddDocument for every document.
// Token count accumulated per field.
133 private int[] fieldLengths
;
// Next token position per field.
134 private int[] fieldPositions
;
// Running character offset per field.
135 private int[] fieldOffsets
;
// Boost per field: document boost multiplied by each field instance's boost.
136 private float[] fieldBoosts
;
138 // Tokenizes the fields of a document into Postings.
// For each field of doc, the running length/position/offset for its field
// number are loaded, postings are added through AddPosition, and the updated
// values (plus the field boost) are stored back into the per-field arrays.
139 private void InvertDocument(Document doc
)
141 foreach(Field field
in doc
.Fields())
143 System
.String fieldName
= field
.Name();
144 int fieldNumber
= fieldInfos
.FieldNumber(fieldName
);
// Resume from the state left by earlier instances of the same field number.
146 int length
= fieldLengths
[fieldNumber
]; // length of field
147 int position
= fieldPositions
[fieldNumber
]; // position in field
// Advance the position by the analyzer's position-increment gap for this field.
149 position
+= analyzer
.GetPositionIncrementGap(fieldName
);
150 int offset
= fieldOffsets
[fieldNumber
]; // offset field
152 if (field
.IsIndexed())
154 if (!field
.IsTokenized())
156 // un-tokenized field
// The whole string value becomes a single term.
157 System
.String stringValue
= field
.StringValue();
// With offsets stored, the term spans [offset, offset + stringValue.Length).
158 if (field
.IsStoreOffsetWithTermVector())
159 AddPosition(fieldName
, stringValue
, position
++, new TermVectorOffsetInfo(offset
, offset
+ stringValue
.Length
));
// Same posting without offset information.
161 AddPosition(fieldName
, stringValue
, position
++, null);
162 offset
+= stringValue
.Length
;
// Tokenized field: a Reader value is preferred, otherwise the String value
// is wrapped in a StringReader.
167 System
.IO
.TextReader reader
; // find or make Reader
168 if (field
.ReaderValue() != null)
169 reader
= field
.ReaderValue();
170 else if (field
.StringValue() != null)
171 reader
= new System
.IO
.StringReader(field
.StringValue());
173 throw new System
.ArgumentException("field must have either String or Reader value");
175 // Tokenize field and add to postingTable
176 TokenStream stream
= analyzer
.TokenStream(fieldName
, reader
);
// NOTE(review): no assignment to lastToken is visible in the loop shown below;
// presumably it tracks the most recent token - confirm.
179 Token lastToken
= null;
180 for (Token t
= stream
.Next(); t
!= null; t
= stream
.Next())
// Apply the token's position increment; the "- 1" pairs with the position++
// performed in the AddPosition call below.
182 position
+= (t
.GetPositionIncrement() - 1);
// Token offsets are shifted by the field's running offset.
184 if (field
.IsStoreOffsetWithTermVector())
185 AddPosition(fieldName
, t
.TermText(), position
++, new TermVectorOffsetInfo(offset
+ t
.StartOffset(), offset
+ t
.EndOffset()));
// Same posting without offset information.
187 AddPosition(fieldName
, t
.TermText(), position
++, null);
// Count the token and report once the count exceeds maxFieldLength.
190 if (++length
> maxFieldLength
)
192 if (infoStream
!= null)
193 infoStream
.WriteLine("maxFieldLength " + maxFieldLength
+ " reached, ignoring following tokens");
// Move the running offset past the last token's end offset, plus one.
198 if (lastToken
!= null)
199 offset
+= lastToken
.EndOffset() + 1;
// Store the updated per-field state back into the arrays.
207 fieldLengths
[fieldNumber
] = length
; // save field length
208 fieldPositions
[fieldNumber
] = position
; // save field position
209 fieldBoosts
[fieldNumber
] *= field
.GetBoost();
210 fieldOffsets
[fieldNumber
] = offset
;
// Reusable Term that AddPosition fills in to look up postingTable.
215 private Term termBuffer
= new Term("", ""); // avoid consing
// Records one occurrence of (field, text) at the given position: an existing
// Posting gets the position (and offset) appended and its freq incremented;
// otherwise a new Posting is stored under a new Term.
//   field    - name of the field the term occurs in
//   text     - term text
//   position - token position to record
//   offset   - offset info to record, or null (callers pass null when offsets are not stored)
217 private void AddPosition(System
.String field
, System
.String text
, int position
, TermVectorOffsetInfo offset
)
// Look the term up through the shared buffer.
219 termBuffer
.Set(field
, text
);
220 //System.out.println("Offset: " + offset);
221 Posting ti
= (Posting
) postingTable
[termBuffer
];
// NOTE(review): `freq` is not declared in the lines shown here; presumably it
// holds ti.freq for an already-seen term - confirm.
226 if (ti
.positions
.Length
== freq
)
228 // positions array is full
229 int[] newPositions
= new int[freq
* 2]; // double size
230 int[] positions
= ti
.positions
;
231 for (int i
= 0; i
< freq
; i
++)
232 // copy old positions to new
233 newPositions
[i
] = positions
[i
];
234 ti
.positions
= newPositions
;
236 ti
.positions
[freq
] = position
; // add new position
// The offsets array is grown the same way as positions: doubled when full.
240 if (ti
.offsets
.Length
== freq
)
242 TermVectorOffsetInfo
[] newOffsets
= new TermVectorOffsetInfo
[freq
* 2];
243 TermVectorOffsetInfo
[] offsets
= ti
.offsets
;
244 for (int i
= 0; i
< freq
; i
++)
246 newOffsets
[i
] = offsets
[i
];
248 ti
.offsets
= newOffsets
;
250 ti
.offsets
[freq
] = offset
;
252 ti
.freq
= freq
+ 1; // update frequency
256 // word not seen before
// A new Term is allocated as the key, so termBuffer itself is never stored in the table.
257 Term term
= new Term(field
, text
, false);
258 postingTable
[term
] = new Posting(term
, position
, offset
);
// Copies the values of postingTable into a Posting array and sorts that array
// with QuickSort (which compares by term).
// NOTE(review): the return statement is not visible in the lines shown;
// AddDocument uses the result as the sorted Posting[] - confirm.
262 private Posting
[] SortPostingTable()
264 // copy postingTable into an array
265 Posting
[] array
= new Posting
[postingTable
.Count
];
266 System
.Collections
.IEnumerator postings
= postingTable
.Values
.GetEnumerator();
// i advances once per enumerated value, filling the array in enumeration order.
267 for (int i
= 0; postings
.MoveNext(); i
++)
269 array
[i
] = (Posting
) postings
.Current
;
// Sort the full index range in place.
273 QuickSort(array
, 0, array
.Length
- 1);
// Recursively sorts postings[lo..hi] in place, ordering entries by term.CompareTo.
//   postings - array to sort
//   lo, hi   - inclusive index bounds of the range to sort
// NOTE(review): the termination check, the declarations of `left` and `right`,
// and the bodies of the scanning loops are not visible in the lines shown -
// confirm against the complete method before changing anything here.
278 private static void QuickSort(Posting
[] postings
, int lo
, int hi
)
283 int mid
= (lo
+ hi
) / 2;
// Put the entries at lo, mid and hi into term order before partitioning.
285 if (postings
[lo
].term
.CompareTo(postings
[mid
].term
) > 0)
287 Posting tmp
= postings
[lo
];
288 postings
[lo
] = postings
[mid
];
292 if (postings
[mid
].term
.CompareTo(postings
[hi
].term
) > 0)
294 Posting tmp
= postings
[mid
];
295 postings
[mid
] = postings
[hi
];
// The swap above may have put a smaller term at mid, so lo/mid are compared again.
298 if (postings
[lo
].term
.CompareTo(postings
[mid
].term
) > 0)
300 Posting tmp2
= postings
[lo
];
301 postings
[lo
] = postings
[mid
];
302 postings
[mid
] = tmp2
;
// The term at mid is the partition value.
312 Term partition
= postings
[mid
].term
;
// Scan condition for `right`: entries greater than the partition term.
316 while (postings
[right
].term
.CompareTo(partition
) > 0)
// Scan condition for `left`: entries less than or equal to the partition term.
319 while (left
< right
&& postings
[left
].term
.CompareTo(partition
) <= 0)
// Exchange the entries at left and right.
324 Posting tmp
= postings
[left
];
325 postings
[left
] = postings
[right
];
326 postings
[right
] = tmp
;
// Recurse into the two sub-ranges on either side of `left`.
335 QuickSort(postings
, lo
, left
);
336 QuickSort(postings
, left
+ 1, hi
);
// Writes the given postings for one segment: a dictionary entry per term via
// TermInfosWriter, frequency data to "<segment>.frq", delta-encoded positions
// to "<segment>.prx", and term vectors for fields whose FieldInfo has
// storeTermVector set.
//   postings - postings to write, in array order (AddDocument passes the result of SortPostingTable)
//   segment  - segment name used as file-name prefix
// NOTE(review): the try/finally structure and the bodies of the catch clauses
// are not visible in the lines shown - confirm against the complete method.
339 private void WritePostings(Posting
[] postings
, System
.String segment
)
341 IndexOutput freq
= null, prox
= null;
342 TermInfosWriter tis
= null;
343 TermVectorsWriter termVectorWriter
= null;
346 //open files for inverse index storage
347 freq
= directory
.CreateOutput(segment
+ ".frq");
348 prox
= directory
.CreateOutput(segment
+ ".prx");
349 tis
= new TermInfosWriter(directory
, segment
, fieldInfos
, termIndexInterval
);
// One TermInfo instance is reused for every term.
350 TermInfo ti
= new TermInfo();
351 System
.String currentField
= null;
353 for (int i
= 0; i
< postings
.Length
; i
++)
355 Posting posting
= postings
[i
];
357 // add an entry to the dictionary with pointers to prox and freq files
// Arguments: 1, the current .frq and .prx file pointers, and -1.
358 ti
.Set(1, freq
.GetFilePointer(), prox
.GetFilePointer(), - 1);
359 tis
.Add(posting
.term
, ti
);
361 // add an entry to the freq file
362 int postingFreq
= posting
.freq
;
// NOTE(review): the statement executed when postingFreq == 1 is not visible here.
363 if (postingFreq
== 1)
366 // set low bit of doc num.
369 freq
.WriteVInt(0); // the document number
370 freq
.WriteVInt(postingFreq
); // frequency in doc
373 int lastPosition
= 0; // write positions
374 int[] positions
= posting
.positions
;
// Only the first postingFreq entries of positions are written.
375 for (int j
= 0; j
< postingFreq
; j
++)
377 // use delta-encoding
378 int position
= positions
[j
];
379 prox
.WriteVInt(position
- lastPosition
);
380 lastPosition
= position
;
382 // check to see if we switched to a new field
383 System
.String termField
= posting
.term
.Field();
384 if (currentField
!= termField
)
386 // changing field - see if there is something to save
387 currentField
= termField
;
388 FieldInfo fi
= fieldInfos
.FieldInfo(currentField
);
389 if (fi
.storeTermVector
)
// The term-vector writer is created on first use, and its document opened.
391 if (termVectorWriter
== null)
393 termVectorWriter
= new TermVectorsWriter(directory
, segment
, fieldInfos
);
394 termVectorWriter
.OpenDocument();
396 termVectorWriter
.OpenField(currentField
);
// New field has no term vector: close the previously opened field.
398 else if (termVectorWriter
!= null)
400 termVectorWriter
.CloseField();
// Add the term to the vector only while a field is open in the writer.
403 if (termVectorWriter
!= null && termVectorWriter
.IsFieldOpen())
405 termVectorWriter
.AddTerm(posting
.term
.Text(), postingFreq
, posting
.positions
, posting
.offsets
);
408 if (termVectorWriter
!= null)
409 termVectorWriter
.CloseDocument();
413 // make an effort to close all streams we can but remember and re-throw
414 // the first exception encountered in this process
415 System
.IO
.IOException keep
= null;
421 catch (System
.IO
.IOException e
)
431 catch (System
.IO
.IOException e
)
441 catch (System
.IO
.IOException e
)
446 if (termVectorWriter
!= null)
449 termVectorWriter
.Close();
451 catch (System
.IO
.IOException e
)
// NOTE(review): only the text of keep.StackTrace is passed as the message of a
// new IOException; the original exception object is not attached, so its type
// and message are lost - consider preserving `keep` (e.g. as inner exception).
458 throw new System
.IO
.IOException(keep
.StackTrace
);
// For every field that is indexed and does not omit norms, computes the norm
// and writes its encoded byte to the output named "<segment>.f<n>", where n is
// the field number.
//   segment - segment name used as file-name prefix
// NOTE(review): no Close() of the norms output is visible in the lines shown -
// confirm it is released.
463 private void WriteNorms(System
.String segment
)
465 for (int n
= 0; n
< fieldInfos
.Size(); n
++)
467 FieldInfo fi
= fieldInfos
.FieldInfo(n
);
468 if (fi
.isIndexed
&& !fi
.omitNorms
)
// norm = accumulated field boost * similarity's length norm for the field's token count.
470 float norm
= fieldBoosts
[n
] * similarity
.LengthNorm(fi
.name
, fieldLengths
[n
]);
471 IndexOutput norms
= directory
.CreateOutput(segment
+ ".f" + n
);
// The float norm is written as the single byte produced by Similarity.EncodeNorm.
474 norms
.WriteByte(Similarity
.EncodeNorm(norm
));
/// <summary>
/// Sets the writer that receives a message when maxFieldLength is reached.
/// Nothing is printed while the stream is null.
/// </summary>
/// <param name="infoStream">Destination for the message, or null for none</param>
internal void SetInfoStream(System.IO.TextWriter infoStream)
{
    this.infoStream = infoStream;
}
493 // info about a Term in a doc
494 internal Term term
; // the Term
495 internal int freq
; // its frequency in doc
496 internal int[] positions
; // positions it occurs at
497 internal TermVectorOffsetInfo
[] offsets
;
499 internal Posting(Term t
, int position
, TermVectorOffsetInfo offset
)
503 positions
= new int[1];
504 positions
[0] = position
;
507 offsets
= new TermVectorOffsetInfo
[1];