beagle.git: beagled/Lucene.Net/Search/Similarity.cs
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Field = Lucene.Net.Documents.Field;
using IndexReader = Lucene.Net.Index.IndexReader;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using Term = Lucene.Net.Index.Term;

namespace Lucene.Net.Search
{

    /// <summary>Expert: Scoring API.
    /// <p>Subclasses implement search scoring.
    ///
    /// <p>The score of query <code>q</code> for document <code>d</code> is defined
    /// in terms of these methods as follows:
    ///
    /// <table cellpadding="0" cellspacing="0" border="0">
    /// <tr>
    /// <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
    /// <td valign="middle" align="center">
    /// <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
    /// <td valign="middle"><small>
    /// {@link #Tf(int) tf}(t in d) *
    /// {@link #Idf(Term,Searcher) idf}(t) *
    /// {@link Field#getBoost getBoost}(t.Field in d) *
    /// {@link #LengthNorm(String,int) lengthNorm}(t.Field in d)
    /// </small></td>
    /// <td valign="middle" rowspan="2">&nbsp;*
    /// {@link #Coord(int,int) coord}(q,d) *
    /// {@link #QueryNorm(float) queryNorm}(q)
    /// </td>
    /// </tr>
    /// <tr>
    /// <td valign="top" align="right">
    /// <small>t in q</small>
    /// </td>
    /// </tr>
    /// </table>
    ///
    /// </summary>
    /// <seealso cref="#SetDefault(Similarity)">
    /// </seealso>
    /// <seealso cref="IndexWriter#SetSimilarity(Similarity)">
    /// </seealso>
    /// <seealso cref="Searcher#SetSimilarity(Similarity)">
    /// </seealso>
    public abstract class Similarity
    {
        /// <summary>The Similarity implementation used by default. </summary>
        private static Similarity defaultImpl = new DefaultSimilarity();

        /// <summary>Set the default Similarity implementation used by indexing and search
        /// code.
        ///
        /// </summary>
        /// <seealso cref="Searcher#SetSimilarity(Similarity)">
        /// </seealso>
        /// <seealso cref="IndexWriter#SetSimilarity(Similarity)">
        /// </seealso>
        public static void SetDefault(Similarity similarity)
        {
            Similarity.defaultImpl = similarity;
        }

        /// <summary>Return the default Similarity implementation used by indexing and search
        /// code.
        ///
        /// <p>This is initially an instance of {@link DefaultSimilarity}.
        ///
        /// </summary>
        /// <seealso cref="Searcher#SetSimilarity(Similarity)">
        /// </seealso>
        /// <seealso cref="IndexWriter#SetSimilarity(Similarity)">
        /// </seealso>
        public static Similarity GetDefault()
        {
            return Similarity.defaultImpl;
        }

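        // Illustrative sketch (not part of the original source): a custom Similarity
        // usually subclasses DefaultSimilarity and overrides only the factors it wants
        // to change; the name FlatSimilarity below is hypothetical.
        //
        //     public class FlatSimilarity : DefaultSimilarity
        //     {
        //         // Ignore field length so short and long fields score alike.
        //         public override float LengthNorm(System.String fieldName, int numTokens)
        //         {
        //             return 1.0f;
        //         }
        //     }
        //
        // It would then be installed globally with Similarity.SetDefault(new FlatSimilarity()),
        // or per component via IndexWriter.SetSimilarity(...) / Searcher.SetSimilarity(...),
        // as referenced in the <seealso> tags above.
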
        /// <summary>Cache of decoded bytes. </summary>
        private static readonly float[] NORM_TABLE = new float[256];

        /// <summary>Decodes a normalization factor stored in an index.</summary>
        /// <seealso cref="#EncodeNorm(float)">
        /// </seealso>
        public static float DecodeNorm(byte b)
        {
            return NORM_TABLE[b & 0xFF];
        }

        /// <summary>Computes the normalization value for a Field given the total number of
        /// terms contained in a Field. These values, together with Field boosts, are
        /// stored in an index and multiplied into scores for hits on each Field by the
        /// search code.
        ///
        /// <p>Matches in longer fields are less precise, so implementations of this
        /// method usually return smaller values when <code>numTokens</code> is large,
        /// and larger values when <code>numTokens</code> is small.
        ///
        /// <p>Note that these values are computed when {@link
        /// IndexWriter#AddDocument(Document)} is invoked and are then stored using
        /// {@link #EncodeNorm(float)}. Thus they have limited precision, and documents
        /// must be re-indexed if this method is altered.
        ///
        /// </summary>
        /// <param name="fieldName">the name of the Field
        /// </param>
        /// <param name="numTokens">the total number of tokens contained in fields named
        /// <i>fieldName</i> of <i>doc</i>.
        /// </param>
        /// <returns> a normalization factor for hits on this Field of this document
        ///
        /// </returns>
        /// <seealso cref="Field#SetBoost(float)">
        /// </seealso>
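        /// <remarks>For orientation only: the {@link DefaultSimilarity} that accompanies
        /// this version of Lucene is expected to return <code>1 / sqrt(numTokens)</code>
        /// here (an assumption about the companion class, not something defined in this
        /// file).</remarks>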
        public abstract float LengthNorm(System.String fieldName, int numTokens);

        /// <summary>Computes the normalization value for a query given the sum of the squared
        /// weights of each of the query terms. This value is then multiplied into the
        /// weight of each query term.
        ///
        /// <p>This does not affect ranking, but rather just attempts to make scores
        /// from different queries comparable.
        ///
        /// </summary>
        /// <param name="sumOfSquaredWeights">the sum of the squares of query term weights
        /// </param>
        /// <returns> a normalization factor for query weights
        /// </returns>
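        /// <remarks>For orientation only: {@link DefaultSimilarity} is expected to return
        /// <code>1 / sqrt(sumOfSquaredWeights)</code> here (an assumption about the
        /// companion class, not defined in this file).</remarks>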
        public abstract float QueryNorm(float sumOfSquaredWeights);

        /// <summary>Encodes a normalization factor for storage in an index.
        ///
        /// <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
        /// representing values from around 7x10^9 to 2x10^-9 with about one
        /// significant decimal digit of accuracy. Zero is also represented.
        /// Negative numbers are rounded up to zero. Values too large to represent
        /// are rounded down to the largest representable value. Positive values too
        /// small to represent are rounded up to the smallest positive representable
        /// value.
        ///
        /// </summary>
        /// <seealso cref="Field#SetBoost(float)">
        /// </seealso>
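        /// <remarks>Worked example (illustrative): <code>EncodeNorm(0.5f)</code> and
        /// <code>EncodeNorm(0.51f)</code> both produce the byte 120, and
        /// <code>DecodeNorm((byte) 120)</code> returns 0.5f; nearby values collapse to
        /// the same byte, which is the limited precision described above.</remarks>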
        public static byte EncodeNorm(float f)
        {
            return FloatToByte(f);
        }

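        // Rebuilds an IEEE 754 bit pattern from the small exponent/mantissa fields of the
        // 8-bit norm encoding (see EncodeNorm) and reinterprets it as a float.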
        private static float ByteToFloat(byte b)
        {
            if (b == 0)
                // zero is a special case
                return 0.0f;
            int mantissa = b & 7;
            int exponent = (b >> 3) & 31;
            int bits = ((exponent + (63 - 15)) << 24) | (mantissa << 21);
            return BitConverter.ToSingle(BitConverter.GetBytes(bits), 0);
        }

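        // Packs a float into the 8-bit norm encoding described in EncodeNorm, clamping
        // values that overflow or underflow the representable range.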
        private static byte FloatToByte(float f)
        {
            if (f < 0.0f)
                // round negatives up to zero
                f = 0.0f;

            if (f == 0.0f)
                // zero is a special case
                return 0;

            int bits = BitConverter.ToInt32(BitConverter.GetBytes(f), 0); // parse float into parts
            int mantissa = (bits & 0xffffff) >> 21;
            int exponent = (((bits >> 24) & 0x7f) - 63) + 15;

            if (exponent > 31)
            {
                // overflow: use max value
                exponent = 31;
                mantissa = 7;
            }

            if (exponent < 0)
            {
                // underflow: use min value
                exponent = 0;
                mantissa = 1;
            }

            return (byte) ((exponent << 3) | mantissa); // pack into a byte
        }

        /// <summary>Computes a score factor based on a term or phrase's frequency in a
        /// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
        /// factor for each term in the query and these products are then summed to
        /// form the initial score for a document.
        ///
        /// <p>Terms and phrases repeated in a document indicate the topic of the
        /// document, so implementations of this method usually return larger values
        /// when <code>freq</code> is large, and smaller values when <code>freq</code>
        /// is small.
        ///
        /// <p>The default implementation calls {@link #Tf(float)}.
        ///
        /// </summary>
        /// <param name="freq">the frequency of a term within a document
        /// </param>
        /// <returns> a score factor based on a term's within-document frequency
        /// </returns>
        public virtual float Tf(int freq)
        {
            return Tf((float) freq);
        }

        /// <summary>Computes the amount of a sloppy phrase match, based on an edit distance.
        /// This value is summed for each sloppy phrase match in a document to form
        /// the frequency that is passed to {@link #Tf(float)}.
        ///
        /// <p>A phrase match with a small edit distance to a document passage more
        /// closely matches the document, so implementations of this method usually
        /// return larger values when the edit distance is small and smaller values
        /// when it is large.
        ///
        /// </summary>
        /// <seealso cref="PhraseQuery#SetSlop(int)">
        /// </seealso>
        /// <param name="distance">the edit distance of this sloppy phrase match
        /// </param>
        /// <returns> the frequency increment for this match
        /// </returns>
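        /// <remarks>For orientation only: {@link DefaultSimilarity} is expected to return
        /// <code>1 / (distance + 1)</code> here (an assumption about the companion class,
        /// not defined in this file).</remarks>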
        public abstract float SloppyFreq(int distance);

        /// <summary>Computes a score factor based on a term or phrase's frequency in a
        /// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
        /// factor for each term in the query and these products are then summed to
        /// form the initial score for a document.
        ///
        /// <p>Terms and phrases repeated in a document indicate the topic of the
        /// document, so implementations of this method usually return larger values
        /// when <code>freq</code> is large, and smaller values when <code>freq</code>
        /// is small.
        ///
        /// </summary>
        /// <param name="freq">the frequency of a term within a document
        /// </param>
        /// <returns> a score factor based on a term's within-document frequency
        /// </returns>
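        /// <remarks>For orientation only: {@link DefaultSimilarity} is expected to return
        /// <code>sqrt(freq)</code> here (an assumption about the companion class, not
        /// defined in this file).</remarks>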
        public abstract float Tf(float freq);

        /// <summary>Computes a score factor for a simple term.
        ///
        /// <p>The default implementation is:<pre>
        /// return idf(searcher.docFreq(term), searcher.maxDoc());
        /// </pre>
        ///
        /// Note that {@link Searcher#MaxDoc()} is used instead of
        /// {@link IndexReader#NumDocs()} because it is proportional to
        /// {@link Searcher#DocFreq(Term)} , i.e., when one is inaccurate,
        /// so is the other, and in the same direction.
        ///
        /// </summary>
        /// <param name="term">the term in question
        /// </param>
        /// <param name="searcher">the document collection being searched
        /// </param>
        /// <returns> a score factor for the term
        /// </returns>
        public virtual float Idf(Term term, Searcher searcher)
        {
            return Idf(searcher.DocFreq(term), searcher.MaxDoc());
        }

        /// <summary>Computes a score factor for a phrase.
        ///
        /// <p>The default implementation sums the {@link #Idf(Term,Searcher)} factor
        /// for each term in the phrase.
        ///
        /// </summary>
        /// <param name="terms">the terms in the phrase
        /// </param>
        /// <param name="searcher">the document collection being searched
        /// </param>
        /// <returns> a score factor for the phrase
        /// </returns>
        public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
        {
            float idf = 0.0f;
            System.Collections.IEnumerator i = terms.GetEnumerator();
            while (i.MoveNext())
            {
                idf += Idf((Term) i.Current, searcher);
            }
            return idf;
        }

        /// <summary>Computes a score factor based on a term's document frequency (the number
        /// of documents which contain the term). This value is multiplied by the
        /// {@link #Tf(int)} factor for each term in the query and these products are
        /// then summed to form the initial score for a document.
        ///
        /// <p>Terms that occur in fewer documents are better indicators of topic, so
        /// implementations of this method usually return larger values for rare terms,
        /// and smaller values for common terms.
        ///
        /// </summary>
        /// <param name="docFreq">the number of documents which contain the term
        /// </param>
        /// <param name="numDocs">the total number of documents in the collection
        /// </param>
        /// <returns> a score factor based on the term's document frequency
        /// </returns>
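        /// <remarks>For orientation only: {@link DefaultSimilarity} is expected to return
        /// <code>log(numDocs / (docFreq + 1)) + 1</code> here (an assumption about the
        /// companion class, not defined in this file).</remarks>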
        public abstract float Idf(int docFreq, int numDocs);

        /// <summary>Computes a score factor based on the fraction of all query terms that a
        /// document contains. This value is multiplied into scores.
        ///
        /// <p>The presence of a large portion of the query terms indicates a better
        /// match with the query, so implementations of this method usually return
        /// larger values when the ratio between these parameters is large and smaller
        /// values when the ratio between them is small.
        ///
        /// </summary>
        /// <param name="overlap">the number of query terms matched in the document
        /// </param>
        /// <param name="maxOverlap">the total number of terms in the query
        /// </param>
        /// <returns> a score factor based on term overlap with the query
        /// </returns>
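        /// <remarks>For orientation only: {@link DefaultSimilarity} is expected to return
        /// <code>overlap / (float) maxOverlap</code> here (an assumption about the
        /// companion class, not defined in this file).</remarks>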
        public abstract float Coord(int overlap, int maxOverlap);

        static Similarity()
        {
            // Pre-compute the decoded float for each of the 256 possible norm bytes.
            for (int i = 0; i < 256; i++)
                NORM_TABLE[i] = ByteToFloat((byte) i);
        }
    }
}