2006-09-10 Francisco Javier F. Serrador <serrador@openshine.com>
[beagle.git] / beagled / Lucene.Net / Search / Similarity.cs
blob9bfabecf293d025f867f0eef376985aa324326eb
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 using System;
17 using Field = Lucene.Net.Documents.Field;
18 using IndexReader = Lucene.Net.Index.IndexReader;
19 using IndexWriter = Lucene.Net.Index.IndexWriter;
20 using Term = Lucene.Net.Index.Term;
21 namespace Lucene.Net.Search
25 /// <summary>Expert: Scoring API.
26 /// <p>Subclasses implement search scoring.
27 ///
28 /// <p>The score of query <code>q</code> for document <code>d</code> is defined
29 /// in terms of these methods as follows:
30 ///
31 /// <table cellpadding="0" cellspacing="0" border="0">
32 /// <tr>
33 /// <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
34 /// <td valign="middle" align="center">
35 /// <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
36 /// <td valign="middle"><small>
37 /// ( {@link #Tf(int) tf}(t in d) *
38 /// {@link #Idf(Term,Searcher) idf}(t)^2 *
39 /// {@link Query#getBoost getBoost}(t in q) *
40 /// {@link Field#getBoost getBoost}(t.field in d) *
41 /// {@link #LengthNorm(String,int) lengthNorm}(t.field in d) )
42 /// </small></td>
43 /// <td valign="middle" rowspan="2">&nbsp;*
44 /// {@link #Coord(int,int) coord}(q,d) *
45 /// {@link #QueryNorm(float) queryNorm}(sumOfSqaredWeights)
46 /// </td>
47 /// </tr>
48 /// <tr>
49 /// <td valign="top" align="right">
50 /// <small>t in q</small>
51 /// </td>
52 /// </tr>
53 /// </table>
54 ///
55 /// <p> where
56 ///
57 /// <table cellpadding="0" cellspacing="0" border="0">
58 /// <tr>
59 /// <td valign="middle" align="right" rowspan="2">sumOfSqaredWeights =<br></td>
60 /// <td valign="middle" align="center">
61 /// <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
62 /// <td valign="middle"><small>
63 /// ( {@link #Idf(Term,Searcher) idf}(t) *
64 /// {@link Query#getBoost getBoost}(t in q) )^2
65 /// </small></td>
66 /// </tr>
67 /// <tr>
68 /// <td valign="top" align="right">
69 /// <small>t in q</small>
70 /// </td>
71 /// </tr>
72 /// </table>
73 ///
74 /// <p> Note that the above formula is motivated by the cosine-distance or dot-product
75 /// between document and query vector, which is implemented by {@link DefaultSimilarity}.
76 ///
77 /// </summary>
78 /// <seealso cref="#SetDefault(Similarity)">
79 /// </seealso>
80 /// <seealso cref="IndexWriter#SetSimilarity(Similarity)">
81 /// </seealso>
82 /// <seealso cref="Searcher#SetSimilarity(Similarity)">
83 /// </seealso>
84 [Serializable]
85 public abstract class Similarity
87 /// <summary>The Similarity implementation used by default. </summary>
88 private static Similarity defaultImpl = new DefaultSimilarity();
90 /// <summary>Set the default Similarity implementation used by indexing and search
91 /// code.
92 ///
93 /// </summary>
94 /// <seealso cref="Searcher#SetSimilarity(Similarity)">
95 /// </seealso>
96 /// <seealso cref="IndexWriter#SetSimilarity(Similarity)">
97 /// </seealso>
98 public static void SetDefault(Similarity similarity)
100 Similarity.defaultImpl = similarity;
103 /// <summary>Return the default Similarity implementation used by indexing and search
104 /// code.
105 ///
106 /// <p>This is initially an instance of {@link DefaultSimilarity}.
107 ///
108 /// </summary>
109 /// <seealso cref="Searcher#SetSimilarity(Similarity)">
110 /// </seealso>
111 /// <seealso cref="IndexWriter#SetSimilarity(Similarity)">
112 /// </seealso>
113 public static Similarity GetDefault()
115 return Similarity.defaultImpl;
118 /// <summary>Cache of decoded bytes. </summary>
119 private static readonly float[] NORM_TABLE = new float[256];
121 /// <summary>Decodes a normalization factor stored in an index.</summary>
122 /// <seealso cref="#EncodeNorm(float)">
123 /// </seealso>
124 public static float DecodeNorm(byte b)
126 return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
129 /// <summary>Returns a table for decoding normalization bytes.</summary>
130 /// <seealso cref="#EncodeNorm(float)">
131 /// </seealso>
132 public static float[] GetNormDecoder()
134 return NORM_TABLE;
137 /// <summary>Computes the normalization value for a field given the total number of
138 /// terms contained in a field. These values, together with field boosts, are
139 /// stored in an index and multipled into scores for hits on each field by the
140 /// search code.
141 ///
142 /// <p>Matches in longer fields are less precise, so implementations of this
143 /// method usually return smaller values when <code>numTokens</code> is large,
144 /// and larger values when <code>numTokens</code> is small.
145 ///
146 /// <p>That these values are computed under {@link
147 /// IndexWriter#AddDocument(Lucene.Net.document.Document)} and stored then using
148 /// {@link #EncodeNorm(float)}. Thus they have limited precision, and documents
149 /// must be re-indexed if this method is altered.
150 ///
151 /// </summary>
152 /// <param name="fieldName">the name of the field
153 /// </param>
154 /// <param name="numTokens">the total number of tokens contained in fields named
155 /// <i>fieldName</i> of <i>doc</i>.
156 /// </param>
157 /// <returns> a normalization factor for hits on this field of this document
158 ///
159 /// </returns>
160 /// <seealso cref="Field#SetBoost(float)">
161 /// </seealso>
162 public abstract float LengthNorm(System.String fieldName, int numTokens);
164 /// <summary>Computes the normalization value for a query given the sum of the squared
165 /// weights of each of the query terms. This value is then multipled into the
166 /// weight of each query term.
167 ///
168 /// <p>This does not affect ranking, but rather just attempts to make scores
169 /// from different queries comparable.
170 ///
171 /// </summary>
172 /// <param name="sumOfSquaredWeights">the sum of the squares of query term weights
173 /// </param>
174 /// <returns> a normalization factor for query weights
175 /// </returns>
176 public abstract float QueryNorm(float sumOfSquaredWeights);
178 /// <summary>Encodes a normalization factor for storage in an index.
179 ///
180 /// <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
181 /// representing values from around 7x10^9 to 2x10^-9 with about one
182 /// significant decimal digit of accuracy. Zero is also represented.
183 /// Negative numbers are rounded up to zero. Values too large to represent
184 /// are rounded down to the largest representable value. Positive values too
185 /// small to represent are rounded up to the smallest positive representable
186 /// value.
187 ///
188 /// </summary>
189 /// <seealso cref="Field#SetBoost(float)">
190 /// </seealso>
191 public static byte EncodeNorm(float f)
193 return FloatToByte(f);
196 private static float ByteToFloat(byte b)
198 if (b == 0)
199 // zero is a special case
200 return 0.0f;
201 int mantissa = b & 7;
202 int exponent = (b >> 3) & 31;
203 int bits = ((exponent + (63 - 15)) << 24) | (mantissa << 21);
204 return BitConverter.ToSingle(BitConverter.GetBytes(bits), 0);
207 private static byte FloatToByte(float f)
209 if (f < 0.0f)
210 // round negatives up to zero
211 f = 0.0f;
213 if (f == 0.0f)
214 // zero is a special case
215 return 0;
217 int bits = BitConverter.ToInt32(BitConverter.GetBytes(f), 0); // parse float into parts
218 int mantissa = (bits & 0xffffff) >> 21;
219 int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
221 if (exponent > 31)
223 // overflow: use max value
224 exponent = 31;
225 mantissa = 7;
228 if (exponent < 0)
230 // underflow: use min value
231 exponent = 0;
232 mantissa = 1;
235 return (byte) ((exponent << 3) | mantissa); // pack into a byte
239 /// <summary>Computes a score factor based on a term or phrase's frequency in a
240 /// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
241 /// factor for each term in the query and these products are then summed to
242 /// form the initial score for a document.
243 ///
244 /// <p>Terms and phrases repeated in a document indicate the topic of the
245 /// document, so implementations of this method usually return larger values
246 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
247 /// is small.
248 ///
249 /// <p>The default implementation calls {@link #Tf(float)}.
250 ///
251 /// </summary>
252 /// <param name="freq">the frequency of a term within a document
253 /// </param>
254 /// <returns> a score factor based on a term's within-document frequency
255 /// </returns>
256 public virtual float Tf(int freq)
258 return Tf((float) freq);
261 /// <summary>Computes the amount of a sloppy phrase match, based on an edit distance.
262 /// This value is summed for each sloppy phrase match in a document to form
263 /// the frequency that is passed to {@link #Tf(float)}.
264 ///
265 /// <p>A phrase match with a small edit distance to a document passage more
266 /// closely matches the document, so implementations of this method usually
267 /// return larger values when the edit distance is small and smaller values
268 /// when it is large.
269 ///
270 /// </summary>
271 /// <seealso cref="PhraseQuery#SetSlop(int)">
272 /// </seealso>
273 /// <param name="distance">the edit distance of this sloppy phrase match
274 /// </param>
275 /// <returns> the frequency increment for this match
276 /// </returns>
277 public abstract float SloppyFreq(int distance);
279 /// <summary>Computes a score factor based on a term or phrase's frequency in a
280 /// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
281 /// factor for each term in the query and these products are then summed to
282 /// form the initial score for a document.
283 ///
284 /// <p>Terms and phrases repeated in a document indicate the topic of the
285 /// document, so implementations of this method usually return larger values
286 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
287 /// is small.
288 ///
289 /// </summary>
290 /// <param name="freq">the frequency of a term within a document
291 /// </param>
292 /// <returns> a score factor based on a term's within-document frequency
293 /// </returns>
294 public abstract float Tf(float freq);
296 /// <summary>Computes a score factor for a simple term.
297 ///
298 /// <p>The default implementation is:<pre>
299 /// return idf(searcher.docFreq(term), searcher.maxDoc());
300 /// </pre>
301 ///
302 /// Note that {@link Searcher#MaxDoc()} is used instead of
303 /// {@link IndexReader#NumDocs()} because it is proportional to
304 /// {@link Searcher#DocFreq(Term)} , i.e., when one is inaccurate,
305 /// so is the other, and in the same direction.
306 ///
307 /// </summary>
308 /// <param name="term">the term in question
309 /// </param>
310 /// <param name="searcher">the document collection being searched
311 /// </param>
312 /// <returns> a score factor for the term
313 /// </returns>
314 public virtual float Idf(Term term, Searcher searcher)
316 return Idf(searcher.DocFreq(term), searcher.MaxDoc());
319 /// <summary>Computes a score factor for a phrase.
320 ///
321 /// <p>The default implementation sums the {@link #Idf(Term,Searcher)} factor
322 /// for each term in the phrase.
323 ///
324 /// </summary>
325 /// <param name="terms">the terms in the phrase
326 /// </param>
327 /// <param name="searcher">the document collection being searched
328 /// </param>
329 /// <returns> a score factor for the phrase
330 /// </returns>
331 public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
333 float idf = 0.0f;
334 System.Collections.IEnumerator i = terms.GetEnumerator();
335 while (i.MoveNext())
337 idf += Idf((Term) i.Current, searcher);
339 return idf;
342 /// <summary>Computes a score factor based on a term's document frequency (the number
343 /// of documents which contain the term). This value is multiplied by the
344 /// {@link #Tf(int)} factor for each term in the query and these products are
345 /// then summed to form the initial score for a document.
346 ///
347 /// <p>Terms that occur in fewer documents are better indicators of topic, so
348 /// implementations of this method usually return larger values for rare terms,
349 /// and smaller values for common terms.
350 ///
351 /// </summary>
352 /// <param name="docFreq">the number of documents which contain the term
353 /// </param>
354 /// <param name="numDocs">the total number of documents in the collection
355 /// </param>
356 /// <returns> a score factor based on the term's document frequency
357 /// </returns>
358 public abstract float Idf(int docFreq, int numDocs);
360 /// <summary>Computes a score factor based on the fraction of all query terms that a
361 /// document contains. This value is multiplied into scores.
362 ///
363 /// <p>The presence of a large portion of the query terms indicates a better
364 /// match with the query, so implementations of this method usually return
365 /// larger values when the ratio between these parameters is large and smaller
366 /// values when the ratio between them is small.
367 ///
368 /// </summary>
369 /// <param name="overlap">the number of query terms matched in the document
370 /// </param>
371 /// <param name="maxOverlap">the total number of terms in the query
372 /// </param>
373 /// <returns> a score factor based on term overlap with the query
374 /// </returns>
375 public abstract float Coord(int overlap, int maxOverlap);
376 static Similarity()
379 for (int i = 0; i < 256; i++)
380 NORM_TABLE[i] = ByteToFloat((byte) i);