// beagled/Lucene.Net/Search/Similarity.cs
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Field = Lucene.Net.Documents.Field;
using IndexReader = Lucene.Net.Index.IndexReader;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using Term = Lucene.Net.Index.Term;
using SmallFloat = Lucene.Net.Util.SmallFloat;

namespace Lucene.Net.Search
{
	
	/// <summary>Expert: Scoring API.
	/// <p>Subclasses implement search scoring.
	///
	/// <p>The score of query <code>q</code> for document <code>d</code> is defined
	/// in terms of these methods as follows:
	///
	/// <table cellpadding="0" cellspacing="0" border="0">
	/// <tr>
	/// <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
	/// <td valign="middle" align="center">
	/// <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
	/// <td valign="middle"><small>
	/// ( {@link #Tf(int) tf}(t in d) *
	/// {@link #Idf(Term,Searcher) idf}(t)^2 *
	/// {@link Query#getBoost getBoost}(t in q) *
	/// {@link Field#getBoost getBoost}(t.field in d) *
	/// {@link #LengthNorm(String,int) lengthNorm}(t.field in d) )
	/// </small></td>
	/// <td valign="middle" rowspan="2">&nbsp;*
	/// {@link #Coord(int,int) coord}(q,d) *
	/// {@link #QueryNorm(float) queryNorm}(sumOfSquaredWeights)
	/// </td>
	/// </tr>
	/// <tr>
	/// <td valign="top" align="right">
	/// <small>t in q</small>
	/// </td>
	/// </tr>
	/// </table>
	///
	/// <p> where
	///
	/// <table cellpadding="0" cellspacing="0" border="0">
	/// <tr>
	/// <td valign="middle" align="right" rowspan="2">sumOfSquaredWeights =<br></td>
	/// <td valign="middle" align="center">
	/// <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
	/// <td valign="middle"><small>
	/// ( {@link #Idf(Term,Searcher) idf}(t) *
	/// {@link Query#getBoost getBoost}(t in q) )^2
	/// </small></td>
	/// </tr>
	/// <tr>
	/// <td valign="top" align="right">
	/// <small>t in q</small>
	/// </td>
	/// </tr>
	/// </table>
	///
	/// <p> Note that the above formula is motivated by the cosine distance, or dot product,
	/// between the document and query vectors, which is implemented by {@link DefaultSimilarity}.
	///
	/// </summary>
	/// <seealso cref="SetDefault(Similarity)">
	/// </seealso>
	/// <seealso cref="IndexWriter.SetSimilarity(Similarity)">
	/// </seealso>
	/// <seealso cref="Searcher.SetSimilarity(Similarity)">
	/// </seealso>
	[Serializable]
	public abstract class Similarity
	{
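
		// As a rough, hedged sketch of how the formula documented above combines
		// (the real computation is spread across the query's Weight and Scorer
		// classes), a score could be assembled like this, where FreqInDoc,
		// QueryBoost and FieldNormByte are hypothetical helpers:
		//
		//     float sum = 0.0f;
		//     foreach (Term t in queryTerms)
		//         sum += similarity.Tf(FreqInDoc(t, doc))
		//                * similarity.Idf(t, searcher) * similarity.Idf(t, searcher)
		//                * QueryBoost(t)
		//                * Similarity.DecodeNorm(FieldNormByte(t, doc));
		//     float score = sum
		//                   * similarity.Coord(matchedTerms, totalQueryTerms)
		//                   * similarity.QueryNorm(sumOfSquaredWeights);
		//
		// The decoded field norm folds together the field boost and the lengthNorm
		// value computed at indexing time.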

		/// <summary>The Similarity implementation used by default. </summary>
		private static Similarity defaultImpl = new DefaultSimilarity();
		
		/// <summary>Set the default Similarity implementation used by indexing and search
		/// code.
		///
		/// </summary>
		/// <seealso cref="Searcher.SetSimilarity(Similarity)">
		/// </seealso>
		/// <seealso cref="IndexWriter.SetSimilarity(Similarity)">
		/// </seealso>
		public static void SetDefault(Similarity similarity)
		{
			Similarity.defaultImpl = similarity;
		}
		
		/// <summary>Return the default Similarity implementation used by indexing and search
		/// code.
		///
		/// <p>This is initially an instance of {@link DefaultSimilarity}.
		///
		/// </summary>
		/// <seealso cref="Searcher.SetSimilarity(Similarity)">
		/// </seealso>
		/// <seealso cref="IndexWriter.SetSimilarity(Similarity)">
		/// </seealso>
		public static Similarity GetDefault()
		{
			return Similarity.defaultImpl;
		}
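
		// For illustration, a caller would typically install a custom Similarity
		// before both indexing and searching, so that stored norms and query-time
		// scores stay consistent (MySimilarity is a hypothetical subclass):
		//
		//     Similarity.SetDefault(new MySimilarity());
		//     // or per instance:
		//     indexWriter.SetSimilarity(new MySimilarity());
		//     searcher.SetSimilarity(new MySimilarity());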

		/// <summary>Cache of decoded bytes. </summary>
		private static readonly float[] NORM_TABLE = new float[256];
		
		/// <summary>Decodes a normalization factor stored in an index.</summary>
		/// <seealso cref="EncodeNorm(float)">
		/// </seealso>
		public static float DecodeNorm(byte b)
		{
			return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive values above 127
		}
		
		/// <summary>Returns a table for decoding normalization bytes.</summary>
		/// <seealso cref="EncodeNorm(float)">
		/// </seealso>
		public static float[] GetNormDecoder()
		{
			return NORM_TABLE;
		}

		/// <summary>Computes the normalization value for a field given the total number of
		/// terms contained in a field. These values, together with field boosts, are
		/// stored in an index and multiplied into scores for hits on each field by the
		/// search code.
		///
		/// <p>Matches in longer fields are less precise, so implementations of this
		/// method usually return smaller values when <code>numTokens</code> is large,
		/// and larger values when <code>numTokens</code> is small.
		///
		/// <p>Note that these values are computed under {@link
		/// IndexWriter#AddDocument(Lucene.Net.Documents.Document)} and then stored using
		/// {@link #EncodeNorm(float)}. Thus they have limited precision, and documents
		/// must be re-indexed if this method is altered.
		///
		/// </summary>
		/// <param name="fieldName">the name of the field
		/// </param>
		/// <param name="numTokens">the total number of tokens contained in fields named
		/// <i>fieldName</i> of <i>doc</i>.
		/// </param>
		/// <returns> a normalization factor for hits on this field of this document
		///
		/// </returns>
		/// <seealso cref="Field.SetBoost(float)">
		/// </seealso>
		public abstract float LengthNorm(System.String fieldName, int numTokens);
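
		// For illustration, the stock DefaultSimilarity computes this as
		// 1 / sqrt(numTokens); a minimal overriding sketch:
		//
		//     public override float LengthNorm(System.String fieldName, int numTokens)
		//     {
		//         return (float) (1.0 / System.Math.Sqrt(numTokens));
		//     }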

		/// <summary>Computes the normalization value for a query given the sum of the squared
		/// weights of each of the query terms. This value is then multiplied into the
		/// weight of each query term.
		///
		/// <p>This does not affect ranking, but rather just attempts to make scores
		/// from different queries comparable.
		///
		/// </summary>
		/// <param name="sumOfSquaredWeights">the sum of the squares of query term weights
		/// </param>
		/// <returns> a normalization factor for query weights
		/// </returns>
		public abstract float QueryNorm(float sumOfSquaredWeights);
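
		// For illustration, the stock DefaultSimilarity computes this as
		// 1 / sqrt(sumOfSquaredWeights); a minimal overriding sketch:
		//
		//     public override float QueryNorm(float sumOfSquaredWeights)
		//     {
		//         return (float) (1.0 / System.Math.Sqrt(sumOfSquaredWeights));
		//     }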

		/// <summary>Encodes a normalization factor for storage in an index.
		///
		/// <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
		/// the zero-exponent point at 15, thus
		/// representing values from around 7x10^9 down to 2x10^-9 with about one
		/// significant decimal digit of accuracy. Zero is also represented.
		/// Negative numbers are rounded up to zero. Values too large to represent
		/// are rounded down to the largest representable value. Positive values too
		/// small to represent are rounded up to the smallest positive representable
		/// value.
		///
		/// </summary>
		/// <seealso cref="Field.SetBoost(float)">
		/// </seealso>
		/// <seealso cref="SmallFloat">
		/// </seealso>
		public static byte EncodeNorm(float f)
		{
			return (byte) SmallFloat.FloatToByte315(f);
		}
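
		// A small round-trip example illustrating the limited precision of the
		// one-byte encoding:
		//
		//     byte b = Similarity.EncodeNorm(0.89f);
		//     float f = Similarity.DecodeNorm(b); // close to, but not exactly, 0.89f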

		/// <summary>Computes a score factor based on a term or phrase's frequency in a
		/// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
		/// factor for each term in the query and these products are then summed to
		/// form the initial score for a document.
		///
		/// <p>Terms and phrases repeated in a document indicate the topic of the
		/// document, so implementations of this method usually return larger values
		/// when <code>freq</code> is large, and smaller values when <code>freq</code>
		/// is small.
		///
		/// <p>The default implementation calls {@link #Tf(float)}.
		///
		/// </summary>
		/// <param name="freq">the frequency of a term within a document
		/// </param>
		/// <returns> a score factor based on a term's within-document frequency
		/// </returns>
		public virtual float Tf(int freq)
		{
			return Tf((float) freq);
		}

		/// <summary>Computes the amount of a sloppy phrase match, based on an edit distance.
		/// This value is summed for each sloppy phrase match in a document to form
		/// the frequency that is passed to {@link #Tf(float)}.
		///
		/// <p>A phrase match with a small edit distance to a document passage more
		/// closely matches the document, so implementations of this method usually
		/// return larger values when the edit distance is small and smaller values
		/// when it is large.
		///
		/// </summary>
		/// <seealso cref="PhraseQuery.SetSlop(int)">
		/// </seealso>
		/// <param name="distance">the edit distance of this sloppy phrase match
		/// </param>
		/// <returns> the frequency increment for this match
		/// </returns>
		public abstract float SloppyFreq(int distance);
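
		// For illustration, the stock DefaultSimilarity computes this as
		// 1 / (distance + 1); a minimal overriding sketch:
		//
		//     public override float SloppyFreq(int distance)
		//     {
		//         return 1.0f / (distance + 1);
		//     }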

		/// <summary>Computes a score factor based on a term or phrase's frequency in a
		/// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
		/// factor for each term in the query and these products are then summed to
		/// form the initial score for a document.
		///
		/// <p>Terms and phrases repeated in a document indicate the topic of the
		/// document, so implementations of this method usually return larger values
		/// when <code>freq</code> is large, and smaller values when <code>freq</code>
		/// is small.
		///
		/// </summary>
		/// <param name="freq">the frequency of a term within a document
		/// </param>
		/// <returns> a score factor based on a term's within-document frequency
		/// </returns>
		public abstract float Tf(float freq);
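
		// For illustration, the stock DefaultSimilarity computes this as
		// sqrt(freq); a minimal overriding sketch:
		//
		//     public override float Tf(float freq)
		//     {
		//         return (float) System.Math.Sqrt(freq);
		//     }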

		/// <summary>Computes a score factor for a simple term.
		///
		/// <p>The default implementation is:<pre>
		/// return Idf(searcher.DocFreq(term), searcher.MaxDoc());
		/// </pre>
		///
		/// Note that {@link Searcher#MaxDoc()} is used instead of
		/// {@link IndexReader#NumDocs()} because it is proportional to
		/// {@link Searcher#DocFreq(Term)}, i.e., when one is inaccurate,
		/// so is the other, and in the same direction.
		///
		/// </summary>
		/// <param name="term">the term in question
		/// </param>
		/// <param name="searcher">the document collection being searched
		/// </param>
		/// <returns> a score factor for the term
		/// </returns>
		public virtual float Idf(Term term, Searcher searcher)
		{
			return Idf(searcher.DocFreq(term), searcher.MaxDoc());
		}

		/// <summary>Computes a score factor for a phrase.
		///
		/// <p>The default implementation sums the {@link #Idf(Term,Searcher)} factor
		/// for each term in the phrase.
		///
		/// </summary>
		/// <param name="terms">the terms in the phrase
		/// </param>
		/// <param name="searcher">the document collection being searched
		/// </param>
		/// <returns> a score factor for the phrase
		/// </returns>
		public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
		{
			float idf = 0.0f;
			System.Collections.IEnumerator i = terms.GetEnumerator();
			while (i.MoveNext())
			{
				idf += Idf((Term) i.Current, searcher);
			}
			return idf;
		}

		/// <summary>Computes a score factor based on a term's document frequency (the number
		/// of documents which contain the term). This value is multiplied by the
		/// {@link #Tf(int)} factor for each term in the query and these products are
		/// then summed to form the initial score for a document.
		///
		/// <p>Terms that occur in fewer documents are better indicators of topic, so
		/// implementations of this method usually return larger values for rare terms,
		/// and smaller values for common terms.
		///
		/// </summary>
		/// <param name="docFreq">the number of documents which contain the term
		/// </param>
		/// <param name="numDocs">the total number of documents in the collection
		/// </param>
		/// <returns> a score factor based on the term's document frequency
		/// </returns>
		public abstract float Idf(int docFreq, int numDocs);
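
		// For illustration, the stock DefaultSimilarity computes this as
		// log(numDocs / (docFreq + 1)) + 1; a minimal overriding sketch:
		//
		//     public override float Idf(int docFreq, int numDocs)
		//     {
		//         return (float) (System.Math.Log(numDocs / (double) (docFreq + 1)) + 1.0);
		//     }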

		/// <summary>Computes a score factor based on the fraction of all query terms that a
		/// document contains. This value is multiplied into scores.
		///
		/// <p>The presence of a large portion of the query terms indicates a better
		/// match with the query, so implementations of this method usually return
		/// larger values when the ratio between these parameters is large and smaller
		/// values when the ratio between them is small.
		///
		/// </summary>
		/// <param name="overlap">the number of query terms matched in the document
		/// </param>
		/// <param name="maxOverlap">the total number of terms in the query
		/// </param>
		/// <returns> a score factor based on term overlap with the query
		/// </returns>
		public abstract float Coord(int overlap, int maxOverlap);
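
		// For illustration, the stock DefaultSimilarity computes this as
		// overlap / maxOverlap; a minimal overriding sketch:
		//
		//     public override float Coord(int overlap, int maxOverlap)
		//     {
		//         return overlap / (float) maxOverlap;
		//     }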

		static Similarity()
		{
			for (int i = 0; i < 256; i++)
				NORM_TABLE[i] = SmallFloat.Byte315ToFloat((byte) i);
		}
	}
}