3 using System
.Collections
;
5 using Lucene
.Net
.Documents
;
6 using Lucene
.Net
.Analysis
;
7 using Lucene
.Net
.Store
;
8 using Lucene
.Net
.Search
;
10 namespace Lucene
.Net
.Index
12 /* ====================================================================
13 * The Apache Software License, Version 1.1
15 * Copyright (c) 2001 The Apache Software Foundation. All rights
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
25 * 2. Redistributions in binary form must reproduce the above copyright
26 * notice, this list of conditions and the following disclaimer in
27 * the documentation and/or other materials provided with the
30 * 3. The end-user documentation included with the redistribution,
31 * if any, must include the following acknowledgment:
32 * "This product includes software developed by the
33 * Apache Software Foundation (http://www.apache.org/)."
34 * Alternately, this acknowledgment may appear in the software itself,
35 * if and wherever such third-party acknowledgments normally appear.
37 * 4. The names "Apache" and "Apache Software Foundation" and
38 * "Apache Lucene" must not be used to endorse or promote products
39 * derived from this software without prior written permission. For
40 * written permission, please contact apache@apache.org.
42 * 5. Products derived from this software may not be called "Apache",
43 * "Apache Lucene", nor may "Apache" appear in their name, without
44 * prior written permission of the Apache Software Foundation.
46 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
47 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
48 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
49 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
50 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
51 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
52 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
53 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
54 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
55 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
56 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * ====================================================================
60 * This software consists of voluntary contributions made by many
61 * individuals on behalf of the Apache Software Foundation. For more
62 * information on the Apache Software Foundation, please see
63 * <http://www.apache.org/>.
66 public sealed class DocumentWriter
// Collaborators supplied at construction time; they are never reassigned afterwards.
private readonly Analyzer analyzer;                      // produces the token stream for tokenized fields
private readonly Lucene.Net.Store.Directory directory;   // destination of every file this writer creates
private readonly Similarity similarity;                  // supplies the length norm used by WriteNorms

// Rebuilt by AddDocument for every document, so it cannot be readonly.
private FieldInfos fieldInfos;

// Position threshold at which InvertDocument stops consuming a field's token stream.
private readonly int maxFieldLength;

/// <summary>
/// Creates a writer that stores single documents into <paramref name="directory"/>.
/// </summary>
/// <param name="directory">Directory in which the segment files are created.</param>
/// <param name="analyzer">Analyzer used to tokenize the tokenized fields of a document.</param>
/// <param name="similarity">Similarity used to compute the per-field length norm.</param>
/// <param name="maxFieldLength">Token position after which tokenizing of a field stops.</param>
public DocumentWriter(Lucene.Net.Store.Directory directory, Analyzer analyzer,
    Similarity similarity, int maxFieldLength)
{
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = similarity;
    this.maxFieldLength = maxFieldLength;
}
/// <summary>
/// Writes <paramref name="doc"/> into the directory as the segment named
/// <paramref name="segment"/>: field names (".fnm"), stored field values,
/// postings and the norms of the indexed fields.
/// </summary>
/// <param name="segment">Segment name, used as the prefix of every file created.</param>
/// <param name="doc">The document to write.</param>
public void AddDocument(String segment, Document doc)
{
    // write field names; the document's fields must be registered first,
    // because the numbers and the size of fieldInfos are used below
    fieldInfos = new FieldInfos();
    fieldInfos.Add(doc);
    fieldInfos.Write(directory, segment + ".fnm");

    // write field values, releasing the writer even when AddDocument throws
    FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
    try
    {
        fieldsWriter.AddDocument(doc);
    }
    finally
    {
        fieldsWriter.Close();
    }

    // reset the per-document buffers
    postingTable.Clear();                           // clear postingTable
    fieldLengths = new int[fieldInfos.Size()];      // init fieldLengths
    fieldBoosts = new float[fieldInfos.Size()];     // init fieldBoosts
    float boost = doc.GetBoost();
    for (int i = 0; i < fieldBoosts.Length; i++)
    {
        fieldBoosts[i] = boost;                     // every field starts at the document boost
    }

    // invert doc into postingTable
    InvertDocument(doc);

    // sort postingTable into an array
    Posting[] postings = SortPostingTable();

    // write postings
    WritePostings(postings, segment);

    // write norms of indexed fields
    WriteNorms(doc, segment);
}
// Buffer holding one document while it is being inverted, before it is
// written to the index. Each key is a Term and each value is its Posting.
private readonly Hashtable postingTable = new Hashtable();

// Per-field state, indexed by the field number obtained from fieldInfos.
private int[] fieldLengths;     // next position / length of each field
private float[] fieldBoosts;    // accumulated boost of each field
/// <summary>
/// Tokenizes the fields of a document into Postings.
/// </summary>
/// <remarks>
/// For every indexed field the positions handed to AddPosition continue from
/// the length already recorded for that field number; afterwards the field's
/// length is stored and its boost is multiplied into fieldBoosts.
/// </remarks>
/// <param name="doc">The document whose indexed fields are inverted.</param>
/// <exception cref="ArgumentException">
/// A tokenized field has neither a Reader value nor a String value.
/// </exception>
private void InvertDocument(Document doc)
{
    foreach (Field field in doc.Fields())
    {
        if (!field.IsIndexed())
        {
            continue;
        }

        String fieldName = field.Name();
        int fieldNumber = fieldInfos.FieldNumber(fieldName);
        int position = fieldLengths[fieldNumber];   // position in field

        if (!field.IsTokenized())
        {
            // un-tokenized field: the whole value is a single term
            AddPosition(fieldName, field.StringValue(), position++);
        }
        else
        {
            // find or make Reader
            TextReader reader = field.ReaderValue();
            if (reader == null)
            {
                String text = field.StringValue();
                if (text == null)
                {
                    throw new ArgumentException
                        ("field must have either String or Reader value");
                }
                reader = new StringReader(text);
            }

            // Tokenize field and add to postingTable
            TokenStream stream = analyzer.TokenStream(fieldName, reader);
            try
            {
                for (Token t = stream.Next(); t != null; t = stream.Next())
                {
                    position += (t.GetPositionIncrement() - 1);
                    AddPosition(fieldName, t.TermText(), position++);
                    if (position > maxFieldLength)
                    {
                        break;
                    }
                }
            }
            finally
            {
                // release the stream even when tokenizing fails
                stream.Close();
            }
        }

        fieldLengths[fieldNumber] = position;       // save field length
        fieldBoosts[fieldNumber] *= field.GetBoost();
    }
}
// Scratch Term reused for every lookup so that a lookup allocates nothing.
private readonly Term termBuffer = new Term("", "");

/// <summary>
/// Records one occurrence of the term (<paramref name="field"/>,
/// <paramref name="text"/>) at <paramref name="position"/> in postingTable.
/// </summary>
/// <param name="field">Name of the field the term occurs in.</param>
/// <param name="text">Text of the term.</param>
/// <param name="position">Position of this occurrence within the field.</param>
private void AddPosition(String field, String text, int position)
{
    termBuffer.Set(field, text);
    Posting ti = (Posting)postingTable[termBuffer];

    if (ti != null)
    {
        // word seen before
        int freq = ti.freq;
        if (ti.positions.Length == freq)
        {
            // positions array is full: double its size and keep the old entries
            int[] newPositions = new int[freq * 2];
            Array.Copy(ti.positions, 0, newPositions, 0, freq);
            ti.positions = newPositions;
        }
        ti.positions[freq] = position;      // add new position
        ti.freq = freq + 1;                 // update frequency
    }
    else
    {
        // word not seen before: the key must be a fresh Term, because
        // termBuffer is overwritten on every call
        // NOTE(review): meaning of the third constructor argument (false) is
        // not visible here - confirm against Term.
        Term term = new Term(field, text, false);
        postingTable.Add(term, new Posting(term, position));
    }
}
/// <summary>
/// Copies the Postings buffered in postingTable into an array and sorts
/// that array by term.
/// </summary>
/// <returns>The buffered Postings in ascending term order.</returns>
private Posting[] SortPostingTable()
{
    // copy postingTable into an array
    Posting[] array = new Posting[postingTable.Count];
    postingTable.Values.CopyTo(array, 0);

    // sort the array; for an empty table the range is (0, -1), which
    // QuickSort treats as nothing to do
    QuickSort(array, 0, array.Length - 1);

    return array;
}
/// <summary>
/// Sorts <c>postings[lo..hi]</c> (both bounds inclusive) in place into
/// ascending order of <c>term.CompareTo</c>.
/// </summary>
/// <param name="postings">Array holding the range to sort.</param>
/// <param name="lo">Index of the first element of the range.</param>
/// <param name="hi">Index of the last element of the range.</param>
private static void QuickSort(Posting[] postings, int lo, int hi)
{
    if (lo >= hi)
    {
        return;     // empty or single-element range
    }

    // overflow-safe midpoint
    int mid = lo + (hi - lo) / 2;

    // order postings[lo], postings[mid] and postings[hi] among themselves
    if (postings[lo].term.CompareTo(postings[mid].term) > 0)
    {
        Posting tmp = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp;
    }

    if (postings[mid].term.CompareTo(postings[hi].term) > 0)
    {
        Posting tmp = postings[mid];
        postings[mid] = postings[hi];
        postings[hi] = tmp;

        // the element moved into mid may be smaller than postings[lo]
        if (postings[lo].term.CompareTo(postings[mid].term) > 0)
        {
            Posting tmp2 = postings[lo];
            postings[lo] = postings[mid];
            postings[mid] = tmp2;
        }
    }

    int left = lo + 1;
    int right = hi - 1;

    if (left >= right)
    {
        return;     // at most three elements, already ordered above
    }

    Term partition = postings[mid].term;

    for (; ; )
    {
        while (postings[right].term.CompareTo(partition) > 0)
        {
            --right;
        }

        while (left < right && postings[left].term.CompareTo(partition) <= 0)
        {
            ++left;
        }

        if (left < right)
        {
            Posting tmp = postings[left];
            postings[left] = postings[right];
            postings[right] = tmp;
            --right;
        }
        else
        {
            break;
        }
    }

    QuickSort(postings, lo, left);
    QuickSort(postings, left + 1, hi);
}
/// <summary>
/// Writes the sorted postings of the buffered document: one term dictionary
/// entry per posting, its frequency entry to the ".frq" file and its
/// delta-encoded positions to the ".prx" file.
/// </summary>
/// <param name="postings">Postings in the order they are to be written.</param>
/// <param name="segment">Segment name used as the file name prefix.</param>
private void WritePostings(Posting[] postings, String segment)
{
    OutputStream freq = null, prox = null;
    TermInfosWriter tis = null;

    try
    {
        freq = directory.CreateFile(segment + ".frq");
        prox = directory.CreateFile(segment + ".prx");
        tis = new TermInfosWriter(directory, segment, fieldInfos);
        TermInfo ti = new TermInfo();   // reused for every term

        for (int i = 0; i < postings.Length; i++)
        {
            Posting posting = postings[i];

            // add an entry to the dictionary with pointers to prox and freq files
            // NOTE(review): the constant 1 is presumably the document
            // frequency (one document per segment) - confirm against TermInfo.Set.
            ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer());
            tis.Add(posting.term, ti);

            // add an entry to the freq file
            int f = posting.freq;
            if (f == 1)
            {
                // optimize freq=1
                freq.WriteVInt(1);      // set low bit of doc num.
            }
            else
            {
                freq.WriteVInt(0);      // the document number
                freq.WriteVInt(f);      // frequency in doc
            }

            // write positions, using delta-encoding
            int lastPosition = 0;
            int[] positions = posting.positions;
            for (int j = 0; j < f; j++)
            {
                int position = positions[j];
                prox.WriteVInt(position - lastPosition);
                lastPosition = position;
            }
        }
    }
    finally
    {
        // Close in the order freq, prox, tis; the nesting makes sure a
        // failing Close does not leave the remaining files open.
        try
        {
            if (freq != null) freq.Close();
        }
        finally
        {
            try
            {
                if (prox != null) prox.Close();
            }
            finally
            {
                if (tis != null) tis.Close();
            }
        }
    }
}
/// <summary>
/// Writes one norm byte for every indexed field of <paramref name="doc"/>
/// into the file <c>segment + ".f" + fieldNumber</c>.
/// </summary>
/// <param name="doc">The document whose indexed fields get a norm.</param>
/// <param name="segment">Segment name used as the file name prefix.</param>
private void WriteNorms(Document doc, String segment)
{
    foreach (Field field in doc.Fields())
    {
        if (!field.IsIndexed())
        {
            continue;
        }

        int n = fieldInfos.FieldNumber(field.Name());

        // accumulated boost of the field times its length norm
        float norm =
            fieldBoosts[n] * similarity.LengthNorm(field.Name(), fieldLengths[n]);

        OutputStream norms = directory.CreateFile(segment + ".f" + n);
        try
        {
            norms.WriteByte(Similarity.EncodeNorm(norm));
        }
        finally
        {
            // release the file even when the write fails
            norms.Close();
        }
    }
}
// info about a Term in a doc
internal Term term;             // the Term
internal int freq;              // its frequency in doc
internal int[] positions;       // positions it occurs at

/// <summary>
/// Creates the posting for the first occurrence of a term.
/// </summary>
/// <param name="t">The term this posting describes.</param>
/// <param name="position">Position of the first occurrence.</param>
internal Posting(Term t, int position)
{
    term = t;
    freq = 1;                               // exactly one occurrence so far
    positions = new int[] { position };     // holds only that first position
}