/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 using Analyzer
= Lucene
.Net
.Analysis
.Analyzer
;
18 using Token
= Lucene
.Net
.Analysis
.Token
;
19 using TokenStream
= Lucene
.Net
.Analysis
.TokenStream
;
20 using Document
= Lucene
.Net
.Documents
.Document
;
21 using Field
= Lucene
.Net
.Documents
.Field
;
22 using Similarity
= Lucene
.Net
.Search
.Similarity
;
23 using Directory
= Lucene
.Net
.Store
.Directory
;
24 using OutputStream
= Lucene
.Net
.Store
.OutputStream
;
25 namespace Lucene
.Net
.Index
28 sealed public class DocumentWriter
// Analyzer used to tokenize field text in InvertDocument.
30 private Analyzer analyzer
;
// Directory the segment files (.fnm/.frq/.prx/.f<n>) are created in.
31 private Directory directory
;
// Similarity used by WriteNorms to compute field length norms.
32 private Similarity similarity
;
// Per-document field metadata; recreated in AddDocument and written to segment + ".fnm".
33 private FieldInfos fieldInfos
;
// Cap on tokens indexed per field (checked in InvertDocument's token loop).
34 private int maxFieldLength
;
/// <summary>Builds a writer that can add single documents to an index segment.</summary>
/// <param name="directory">The directory to write the document information to.</param>
/// <param name="analyzer">The analyzer to use for the document.</param>
/// <param name="similarity">The Similarity function.</param>
/// <param name="maxFieldLength">The maximum number of tokens a Field may have.</param>
public /*internal*/ DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
{
    // Simply capture the collaborators; all real work happens in AddDocument.
    this.analyzer = analyzer;
    this.similarity = similarity;
    this.directory = directory;
    this.maxFieldLength = maxFieldLength;
}
// Adds one document to segment "segment": writes field infos (.fnm), the stored
// fields (via FieldsWriter), then inverts the document into postingTable and
// writes the sorted postings and the field norms.
// NOTE(review): this source is garbled/truncated — braces and several original
// statements (e.g. registering doc's fields with fieldInfos, the try/finally
// closing fieldsWriter, and the call that performs the inversion) are not
// visible here; confirm against the upstream file before modifying.
53 /*internal*/ public void AddDocument(System
.String segment
, Document doc
)
// Rebuild field metadata for this document and persist it as segment + ".fnm".
56 fieldInfos
= new FieldInfos();
58 fieldInfos
.Write(directory
, segment
+ ".fnm");
// Write the document's stored field values.
61 FieldsWriter fieldsWriter
= new FieldsWriter(directory
, segment
, fieldInfos
);
64 fieldsWriter
.AddDocument(doc
);
71 // invert doc into postingTable
72 postingTable
.Clear(); // clear postingTable
// Reset the per-field bookkeeping arrays, sized to the current field count.
73 fieldLengths
= new int[fieldInfos
.Size()]; // init fieldLengths
74 fieldPositions
= new int[fieldInfos
.Size()]; // init fieldPositions
76 fieldBoosts
= new float[fieldInfos
.Size()]; // init fieldBoosts
// Seed every field's boost with the document-level boost; InvertDocument
// multiplies in each field's own boost afterwards.
77 float boost
= doc
.GetBoost();
78 for (int i
= 0; i
< fieldBoosts
.Length
; i
++)
80 fieldBoosts
[i
] = boost
;
85 // sort postingTable into an array
86 Posting
[] postings
= SortPostingTable();
// NOTE(review): the block below appears to be commented-out Java debug output
// (System.out, .length) left over from the port — not valid C#; the comment
// delimiters that disabled it are presumably in the truncated lines.
89 for (int i = 0; i < postings.length; i++) {
90 Posting posting = postings[i];
91 System.out.print(posting.term);
92 System.out.print(" freq=" + posting.freq);
93 System.out.print(" pos=");
94 System.out.print(posting.positions[0]);
95 for (int j = 1; j < posting.freq; j++)
96 System.out.print("," + posting.positions[j]);
97 System.out.println("");
// Emit the inverted index entries (.frq/.prx/terms) for this document.
102 WritePostings(postings
, segment
);
104 // write norms of indexed fields
105 WriteNorms(doc
, segment
);
108 // Keys are Terms, values are Postings.
109 // Used to buffer a document before it is written to the index.
// Wrapped with Hashtable.Synchronized for thread-safe individual operations.
110 private System
.Collections
.Hashtable postingTable
= System
.Collections
.Hashtable
.Synchronized(new System
.Collections
.Hashtable());
// Per-field token counts for the current document (indexed by field number).
111 private int[] fieldLengths
;
// Per-field next token position for the current document.
112 private int[] fieldPositions
;
// Per-field boost (document boost times each field's boost), used by WriteNorms.
113 private float[] fieldBoosts
;
115 // Tokenizes the fields of a document into Postings.
// For each indexed field: un-tokenized fields contribute their whole string
// value as a single position; tokenized fields are run through the analyzer
// and every token is added via AddPosition. Updates fieldLengths,
// fieldPositions and fieldBoosts for the field.
// NOTE(review): truncated source — the handling when length exceeds
// maxFieldLength (presumably a break) and any stream cleanup are not visible.
116 private void InvertDocument(Document doc
)
118 foreach(Field field
in doc
.Fields())
120 System
.String fieldName
= field
.Name();
121 int fieldNumber
= fieldInfos
.FieldNumber(fieldName
);
// Continue from where a previous field instance with the same name left off.
123 int length
= fieldLengths
[fieldNumber
]; // length of Field
124 int position
= fieldPositions
[fieldNumber
]; // position in Field
126 if (field
.IsIndexed())
128 if (!field
.IsTokenized())
130 // un-tokenized Field
131 AddPosition(fieldName
, field
.StringValue(), position
++);
136 System
.IO
.TextReader reader
; // find or make Reader
137 if (field
.ReaderValue() != null)
138 reader
= field
.ReaderValue();
139 else if (field
.StringValue() != null)
140 reader
= new System
.IO
.StringReader(field
.StringValue());
142 throw new System
.ArgumentException("Field must have either String or Reader value");
144 // Tokenize Field and add to postingTable
145 TokenStream stream
= analyzer
.TokenStream(fieldName
, reader
);
148 for (Token t
= stream
.Next(); t
!= null; t
= stream
.Next())
// Honor position increments > 1 (e.g. stopword gaps): advance before adding.
150 position
+= (t
.GetPositionIncrement() - 1);
151 AddPosition(fieldName
, t
.TermText(), position
++);
// Enforce the per-field token cap (consequence lines are truncated here).
152 if (++length
> maxFieldLength
)
162 fieldLengths
[fieldNumber
] = length
; // save Field length
163 fieldPositions
[fieldNumber
] = position
; // save Field position
// Multiply in this field instance's boost on top of the document boost.
164 fieldBoosts
[fieldNumber
] *= field
.GetBoost();
// Reusable Term instance for postingTable lookups in AddPosition, so no new
// Term is allocated per token of an already-seen word.
169 private Term termBuffer
= new Term("", ""); // avoid consing
// Records one occurrence of (field, text) at the given position: grows the
// existing Posting's positions array (doubling when full) and bumps its freq,
// or inserts a new Posting for a first-seen term.
// NOTE(review): truncated source — the null-check on ti and the declaration
// "int freq = ti.freq;" (referenced below) are missing from this view, as is
// the else introducing the not-seen-before branch.
171 private void AddPosition(System
.String field
, System
.String text
, int position
)
// Look up via the shared buffer to avoid allocating a Term per lookup.
173 termBuffer
.Set(field
, text
);
174 Posting ti
= (Posting
) postingTable
[termBuffer
];
179 if (ti
.positions
.Length
== freq
)
181 // positions array is full
182 int[] newPositions
= new int[freq
* 2]; // double size
183 int[] positions
= ti
.positions
;
184 for (int i
= 0; i
< freq
; i
++)
185 // copy old positions to new
186 newPositions
[i
] = positions
[i
];
187 ti
.positions
= newPositions
;
189 ti
.positions
[freq
] = position
; // add new position
190 ti
.freq
= freq
+ 1; // update frequency
194 // word not seen before
// A fresh (non-interned) Term is stored as the hashtable key here.
195 Term term
= new Term(field
, text
, false);
196 postingTable
[term
] = new Posting(term
, position
);
// Copies postingTable's values into an array and sorts it (by term) with
// QuickSort, returning the sorted array.
// NOTE(review): truncated source — the trailing "return array;" required by
// the Posting[] return type is not visible in this view.
200 private Posting
[] SortPostingTable()
202 // copy postingTable into an array
203 Posting
[] array
= new Posting
[postingTable
.Count
];
204 System
.Collections
.IEnumerator postings
= postingTable
.Values
.GetEnumerator();
205 for (int i
= 0; postings
.MoveNext(); i
++)
207 array
[i
] = (Posting
) postings
.Current
;
// Sort the whole array in place by term order.
211 QuickSort(array
, 0, array
.Length
- 1);
// In-place quicksort of postings[lo..hi] ordered by Posting.term.
// Uses median-of-three pivot selection (the three ifs below order
// postings[lo], postings[mid], postings[hi]) before partitioning.
// NOTE(review): truncated source — the lo/hi recursion guard, part of the
// median-of-three swaps (tmp is assigned but the completing swap lines are
// missing), and the declarations/loop structure for left/right in the
// partition phase are not visible in this view.
216 private static void QuickSort(Posting
[] postings
, int lo
, int hi
)
221 int mid
= (lo
+ hi
) / 2;
// Order postings[lo] <= postings[mid].
223 if (postings
[lo
].term
.CompareTo(postings
[mid
].term
) > 0)
225 Posting tmp
= postings
[lo
];
226 postings
[lo
] = postings
[mid
];
// Order postings[mid] <= postings[hi].
230 if (postings
[mid
].term
.CompareTo(postings
[hi
].term
) > 0)
232 Posting tmp
= postings
[mid
];
233 postings
[mid
] = postings
[hi
];
// Re-check lo vs mid after the previous swap.
236 if (postings
[lo
].term
.CompareTo(postings
[mid
].term
) > 0)
238 Posting tmp2
= postings
[lo
];
239 postings
[lo
] = postings
[mid
];
240 postings
[mid
] = tmp2
;
// Partition around the median term.
250 Term partition
= postings
[mid
].term
;
254 while (postings
[right
].term
.CompareTo(partition
) > 0)
257 while (left
< right
&& postings
[left
].term
.CompareTo(partition
) <= 0)
// Swap the out-of-place pair found by the two scans above.
262 Posting tmp
= postings
[left
];
263 postings
[left
] = postings
[right
];
264 postings
[right
] = tmp
;
// Recurse on both halves of the partition.
273 QuickSort(postings
, lo
, left
);
274 QuickSort(postings
, left
+ 1, hi
);
// Writes the sorted postings for one document to the segment's inverted-index
// files: term dictionary entries (TermInfosWriter), frequencies (.frq),
// delta-encoded positions (.prx), and term vectors for fields that store them.
// NOTE(review): heavily truncated source — the try/finally scaffolding that
// pairs with the catch clauses below, the freq-encoding branch taken when
// postingFreq == 1 (the visible WriteVInt(0)/WriteVInt(postingFreq) pair is
// only the multi-occurrence path), and the per-stream Close() calls are not
// visible in this view; confirm against the upstream file before editing.
277 private void WritePostings(Posting
[] postings
, System
.String segment
)
279 OutputStream freq
= null, prox
= null;
280 TermInfosWriter tis
= null;
// Created lazily below, only when some field actually stores term vectors.
281 TermVectorsWriter termVectorWriter
= null;
284 //open files for inverse index storage
285 freq
= directory
.CreateFile(segment
+ ".frq");
286 prox
= directory
.CreateFile(segment
+ ".prx");
287 tis
= new TermInfosWriter(directory
, segment
, fieldInfos
);
288 TermInfo ti
= new TermInfo();
289 System
.String currentField
= null;
291 for (int i
= 0; i
< postings
.Length
; i
++)
293 Posting posting
= postings
[i
];
295 // add an entry to the dictionary with pointers to prox and freq files
// docFreq is 1: this writer indexes a single document at a time.
296 ti
.Set(1, freq
.GetFilePointer(), prox
.GetFilePointer(), - 1);
297 tis
.Add(posting
.term
, ti
);
299 // add an entry to the freq file
300 int postingFreq
= posting
.freq
;
301 if (postingFreq
== 1)
304 // set low bit of doc num.
307 freq
.WriteVInt(0); // the document number
308 freq
.WriteVInt(postingFreq
); // frequency in doc
// Positions are written as deltas from the previous position (VInt-friendly).
311 int lastPosition
= 0; // write positions
312 int[] positions
= posting
.positions
;
313 for (int j
= 0; j
< postingFreq
; j
++)
315 // use delta-encoding
316 int position
= positions
[j
];
317 prox
.WriteVInt(position
- lastPosition
);
318 lastPosition
= position
;
320 // check to see if we switched to a new Field
// Reference comparison is intentional here (terms of one field share the
// same field-name string instance) — cheap change-of-field detection.
321 System
.String termField
= posting
.term
.Field();
322 if ((System
.Object
) currentField
!= (System
.Object
) termField
)
324 // changing Field - see if there is something to save
325 currentField
= termField
;
326 FieldInfo fi
= fieldInfos
.FieldInfo(currentField
);
327 if (fi
.storeTermVector
)
329 if (termVectorWriter
== null)
// First vectorized field: open the term-vectors document lazily.
331 termVectorWriter
= new TermVectorsWriter(directory
, segment
, fieldInfos
);
332 termVectorWriter
.OpenDocument();
334 termVectorWriter
.OpenField(currentField
);
336 else if (termVectorWriter
!= null)
338 termVectorWriter
.CloseField();
341 if (termVectorWriter
!= null && termVectorWriter
.IsFieldOpen())
343 termVectorWriter
.AddTerm(posting
.term
.Text(), postingFreq
);
346 if (termVectorWriter
!= null)
347 termVectorWriter
.CloseDocument();
351 // make an effort to close all streams we can but remember and re-throw
352 // the first exception encountered in this process
353 System
.IO
.IOException keep
= null;
359 catch (System
.IO
.IOException e
)
369 catch (System
.IO
.IOException e
)
379 catch (System
.IO
.IOException e
)
384 if (termVectorWriter
!= null)
387 termVectorWriter
.Close();
389 catch (System
.IO
.IOException e
)
// NOTE(review): wrapping keep.StackTrace in a new IOException discards the
// original exception's message/type and misuses the stack trace as a message;
// rethrowing "keep" itself (or at least keep.Message) would be more faithful.
396 throw new System
.IO
.IOException(keep
.StackTrace
);
// Writes one norm byte per field for this document: each field's accumulated
// boost times Similarity.LengthNorm of its token count, encoded via
// Similarity.EncodeNorm into the per-field norm file segment + ".f" + n.
// NOTE(review): truncated source — presumably the write is guarded by an
// indexed-field check and the norms stream is closed in a finally, but those
// lines are not visible here; confirm against the upstream file.
401 private void WriteNorms(Document doc
, System
.String segment
)
403 for (int n
= 0; n
< fieldInfos
.Size(); n
++)
405 FieldInfo fi
= fieldInfos
.FieldInfo(n
);
408 float norm
= fieldBoosts
[n
] * similarity
.LengthNorm(fi
.name
, fieldLengths
[n
]);
409 OutputStream norms
= directory
.CreateFile(segment
+ ".f" + n
);
// Norms are stored as a single encoded byte per document/field.
412 norms
.WriteByte(Lucene
.Net
.Search
.Similarity
.EncodeNorm(norm
));
425 // info about a Term in a doc
// NOTE(review): the enclosing class declaration (presumably "sealed class
// Posting") is not visible in this view.
426 internal Term term
; // the Term
427 internal int freq
; // its frequency in doc
428 internal int[] positions
; // positions it occurs at
// Creates a posting for a term's first occurrence at the given position.
// NOTE(review): truncated source — the assignments that initialize the term
// and freq fields (presumably "term = t; freq = 1;") are not visible here.
430 internal Posting(Term t
, int position
)
// Start with a single-slot positions array; AddPosition doubles it as needed.
434 positions
= new int[1];
435 positions
[0] = position
;