Initial revision
[beagle.git] / Lucene.Net / Search / Similarity.cs
bloba792da23c7fa0b39358c4dde4359b96da47ec306
1 using System;
2 using System.Collections;
4 using Lucene.Net.Index;
5 using Lucene.Net.Documents;
7 namespace Lucene.Net.Search
9 /* ====================================================================
10 * The Apache Software License, Version 1.1
12 * Copyright (c) 2001 The Apache Software Foundation. All rights
13 * reserved.
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in
24 * the documentation and/or other materials provided with the
25 * distribution.
27 * 3. The end-user documentation included with the redistribution,
28 * if any, must include the following acknowledgment:
29 * "This product includes software developed by the
30 * Apache Software Foundation (http://www.apache.org/)."
31 * Alternately, this acknowledgment may appear in the software itself,
32 * if and wherever such third-party acknowledgments normally appear.
34 * 4. The names "Apache" and "Apache Software Foundation" and
35 * "Apache Lucene" must not be used to endorse or promote products
36 * derived from this software without prior written permission. For
37 * written permission, please contact apache@apache.org.
39 * 5. Products derived from this software may not be called "Apache",
40 * "Apache Lucene", nor may "Apache" appear in their name, without
41 * prior written permission of the Apache Software Foundation.
43 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
44 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
45 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
47 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
48 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
49 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
50 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
51 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
52 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
53 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * SUCH DAMAGE.
55 * ====================================================================
57 * This software consists of voluntary contributions made by many
58 * individuals on behalf of the Apache Software Foundation. For more
59 * information on the Apache Software Foundation, please see
60 * <http://www.apache.org/>.
63 /// <summary>
64 /// Expert: Scoring API.
65 /// <p>Subclasses implement search scoring.</p>
66 /// <p>The score of query <code>q</code> for document <code>d</code> is defined
67 /// in terms of these methods as follows:</p>
68 /// <table cellpadding="0" cellspacing="0" border="0">
69 /// <tr>
70 /// <td valign="middle" align="right" rowspan="2">score(q,d) =<br/></td>
71 /// <td valign="middle" align="center">
72 /// <big><big><big><big><big>Sigma;</big></big></big></big></big></td>
73 /// <td valign="middle"><small>
74 /// Tf(int) tf}(t in d)
75 /// Idf(Term,Searcher) idf}(t)
76 /// Field.GetBoost GetBoost(t.field in d)
77 /// LengthNorm(String,int) lengthNorm(t.field in d)
78 /// </small></td>
79 /// <td valign="middle" rowspan="2">
80 /// Coord(int,int) Coord(q,d)
81 /// QueryNorm(float) QueryNorm(q)
82 /// </td>
83 /// </tr>
84 /// <tr>
85 /// <td valign="top" align="right">
86 /// <small>t in q</small>
87 /// </td>
88 /// </tr>
89 /// </table>
90 ///
91 /// <see cref="SetDefault(Similarity)"/>
92 /// <see cref="IndexWriter.SetSimilarity(Similarity)"/>
93 /// <see cref="Searcher.SetSimilarity(Similarity)"/>
94 /// </summary>
95 public abstract class Similarity
97 /// <summary>
98 /// The Similarity implementation used by default.
99 /// </summary>
100 private static Similarity defaultImpl = new DefaultSimilarity();
102 /// <summary>
103 /// Set the default Similarity implementation used by indexing and search
104 /// code.
105 /// <see cref="Searcher.SetSimilarity(Similarity)"/>
106 /// <see cref="IndexWriter.SetSimilarity(Similarity)"/>
107 /// </summary>
108 /// <param name="similarity"></param>
109 public static void SetDefault(Similarity similarity)
111 Similarity.defaultImpl = similarity;
114 /// <summary>
115 /// Return the default Similarity implementation used by indexing and search
116 /// code.
117 /// <p>This is initially an instance of DefaultSimilarity.</p>
118 /// <see cref="Searcher.SetSimilarity(Similarity)"/>
119 /// <see cref="IndexWriter.SetSimilarity(Similarity)"/>
120 /// </summary>
121 /// <returns></returns>
122 public static Similarity GetDefault()
124 return Similarity.defaultImpl;
127 /// <summary>
128 /// Cache of decoded bytes.
129 /// </summary>
130 private static readonly float[] NORM_TABLE = new float[256];
132 static Similarity()
134 for (int i = 0; i < 256; i++)
135 NORM_TABLE[i] = ByteToFloat((byte)i);
138 /// <summary>
139 /// Decodes a normalization factor stored in an index.
140 /// <see cref="EncodeNorm(float)"/>
141 /// </summary>
142 /// <param name="b"></param>
143 /// <returns></returns>
144 public static float DecodeNorm(byte b)
146 return NORM_TABLE[b & 0xFF];
149 /// <summary>
150 /// Computes the normalization value for a field given the total number of
151 /// terms contained in a field. These values, together with field boosts, are
152 /// stored in an index and multipled into scores for hits on each field by the
153 /// search code.
155 /// <p>Matches in longer fields are less precise, so implemenations of this
156 /// method usually return smaller values when <code>numTokens</code> is large,
157 /// and larger values when <code>numTokens</code> is small.</p>
158 ///
159 /// <p>That these values are computed under
160 /// IndexWriter.AddDocument(Document) and stored then using
161 /// EncodeNorm(float). Thus they have limited precision, and documents
162 /// must be re-indexed if this method is altered.
163 /// </p>
164 ///
165 /// <see cref="Field.SetBoost(float)"/>
166 /// </summary>
167 /// <param name="fieldName">the name of the field</param>
168 /// <param name="numTokens">
169 /// the total number of tokens contained in fields named
170 /// <i>fieldName</i> of <i>doc</i>.
171 /// </param>
172 /// <returns>a normalization factor for hits on this field of this document</returns>
173 public abstract float LengthNorm(String fieldName, int numTokens);
175 /// <summary>
176 /// Computes the normalization value for a query given the sum of the squared
177 /// weights of each of the query terms. This value is then multipled into the
178 /// weight of each query term.
180 /// <p>This does not affect ranking, but rather just attempts to make scores
181 /// from different queries comparable.
182 /// </p>
183 /// </summary>
184 /// <param name="sumOfSquaredWeights">the sum of the squares of query term weights</param>
185 /// <returns>a normalization factor for query weights</returns>
186 public abstract float QueryNorm(float sumOfSquaredWeights);
188 /// <summary>
189 /// Encodes a normalization factor for storage in an index.
191 /// <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
192 /// representing values from around 7x10^9 to 2x10^-9 with about one
193 /// significant decimal digit of accuracy. Zero is also represented.
194 /// Negative numbers are rounded up to zero. Values too large to represent
195 /// are rounded down to the largest representable value. Positive values too
196 /// small to represent are rounded up to the smallest positive representable
197 /// value.
198 /// </p>
199 ///
200 /// <see cref="Field.SetBoost(float)"/>
201 /// </summary>
202 /// <param name="f"></param>
203 /// <returns></returns>
204 public static byte EncodeNorm(float f)
206 return FloatToByte(f);
209 private static float ByteToFloat(byte b)
211 if (b == 0) // zero is a special case
212 return 0.0f;
213 int mantissa = b & 7;
214 int exponent = (b >> 3) & 31;
215 int bits = ((exponent+(63-15)) << 24) | (mantissa << 21);
218 return BitConverter.ToSingle(BitConverter.GetBytes(bits), 0);
221 private static byte FloatToByte(float f)
223 if (f < 0.0f) // round negatives up to zero
224 f = 0.0f;
226 if (f == 0.0f) // zero is a special case
227 return 0;
229 int bits = BitConverter.ToInt32(BitConverter.GetBytes(f), 0); // parse float into parts
230 int mantissa = (bits & 0xffffff) >> 21;
231 int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
233 if (exponent > 31)
234 { // overflow: use max value
235 exponent = 31;
236 mantissa = 7;
239 if (exponent < 0)
240 { // underflow: use min value
241 exponent = 0;
242 mantissa = 1;
245 return (byte)((exponent << 3) | mantissa); // pack into a byte
248 /// <summary>
249 /// Computes a score factor based on a term or phrase's frequency in a
250 /// document. This value is multiplied by the Idf(Term, Searcher)
251 /// factor for each term in the query and these products are then summed to
252 /// form the initial score for a document.
254 /// <p>Terms and phrases repeated in a document indicate the topic of the
255 /// document, so implemenations of this method usually return larger values
256 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
257 /// is small.
258 /// </p>
260 /// <p>The default implementation calls Tf(float).</p>
261 /// </summary>
262 /// <param name="freq">the frequency of a term within a document</param>
263 /// <returns>a score factor based on a term's within-document frequency</returns>
264 virtual public float Tf(int freq)
266 return Tf((float)freq);
269 /// <summary>
270 /// Computes the amount of a sloppy phrase match, based on an edit distance.
271 /// This value is summed for each sloppy phrase match in a document to form
272 /// the frequency that is passed to Tf(float).
274 /// <p>A phrase match with a small edit distance to a document passage more
275 /// closely matches the document, so implemenations of this method usually
276 /// return larger values when the edit distance is small and smaller values
277 /// when it is large.
278 /// </p>
280 /// <see cref="PhraseQuery.SetSlop(int)"/>
281 /// </summary>
282 /// <param name="distance">the edit distance of this sloppy phrase match</param>
283 /// <returns>the frequency increment for this match</returns>
284 public abstract float SloppyFreq(int distance);
286 /// <summary>
287 /// Computes a score factor based on a term or phrase's frequency in a
288 /// document. This value is multiplied by the Idf(Term, Searcher)
289 /// factor for each term in the query and these products are then summed to
290 /// form the initial score for a document.
292 /// <p>Terms and phrases repeated in a document indicate the topic of the
293 /// document, so implemenations of this method usually return larger values
294 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
295 /// is small.
296 /// </p>
298 /// </summary>
299 /// <param name="freq">the frequency of a term within a document</param>
300 /// <returns>a score factor based on a term's within-document frequency</returns>
301 public abstract float Tf(float freq);
303 /// <summary>
304 /// Computes a score factor for a simple term.
306 /// <p>The default implementation is:
307 /// <pre>
308 /// return Idf(searcher.DocFreq(term), searcher.MaxDoc());
309 /// </pre>
311 /// Note that Searcher.MaxDoc() is used instead of
312 /// IndexReader.NumDocs() because it is proportional to
313 /// Searcher.DocFreq(Term), i.e., when one is inaccurate, so is the other,
314 /// and in the same direction.
315 /// </p>
317 /// </summary>
318 /// <param name="term">the term in question</param>
319 /// <param name="searcher">the document collection being searched</param>
320 /// <returns>a score factor for the term</returns>
321 virtual public float Idf(Term term, Searcher searcher)
323 return Idf(searcher.DocFreq(term), searcher.MaxDoc());
326 /// <summary>
327 /// Computes a score factor for a phrase.
328 /// <p>The default implementation sums the Idf(Term,Searcher) factor
329 /// for each term in the phrase.
330 /// </p>
331 /// </summary>
332 /// <param name="terms">the vector of terms in the phrase</param>
333 /// <param name="searcher">the document collection being searched</param>
334 /// <returns>a score factor for the phrase</returns>
335 virtual public float Idf(ArrayList terms, Searcher searcher)
337 float _idf = 0.0f;
338 for (int i = 0; i < terms.Count; i++)
340 _idf += Idf((Term)terms[i], searcher);
342 return _idf;
345 /// <summary>
346 /// Computes a score factor based on a term's document frequency (the number
347 /// of documents which contain the term). This value is multiplied by the
348 /// Tf(int) factor for each term in the query and these products are
349 /// then summed to form the initial score for a document.
351 /// <p>Terms that occur in fewer documents are better indicators of topic, so
352 /// implemenations of this method usually return larger values for rare terms,
353 /// and smaller values for common terms.
354 /// </p>
355 /// </summary>
356 /// <param name="docFreq">the number of documents which contain the term</param>
357 /// <param name="numDocs">the total number of documents in the collection</param>
358 /// <returns>a score factor based on the term's document frequency</returns>
359 public abstract float Idf(int docFreq, int numDocs);
361 /// <summary>
362 /// Computes a score factor based on the fraction of all query terms that a
363 /// document contains. This value is multiplied into scores.
365 /// <p>The presence of a large portion of the query terms indicates a better
366 /// match with the query, so implemenations of this method usually return
367 /// larger values when the ratio between these parameters is large and smaller
368 /// values when the ratio between them is small.
369 /// </p>
370 /// </summary>
371 /// <param name="overlap">the number of query terms matched in the document</param>
372 /// <param name="maxOverlap">the total number of terms in the query</param>
373 /// <returns>a score factor based on term overlap with the query</returns>
374 public abstract float Coord(int overlap, int maxOverlap);