2 using System
.Collections
;
4 using Lucene
.Net
.Index
;
5 using Lucene
.Net
.Documents
;
7 namespace Lucene
.Net
.Search
9 /* ====================================================================
10 * The Apache Software License, Version 1.1
12 * Copyright (c) 2001 The Apache Software Foundation. All rights
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in
24 * the documentation and/or other materials provided with the
27 * 3. The end-user documentation included with the redistribution,
28 * if any, must include the following acknowledgment:
29 * "This product includes software developed by the
30 * Apache Software Foundation (http://www.apache.org/)."
31 * Alternately, this acknowledgment may appear in the software itself,
32 * if and wherever such third-party acknowledgments normally appear.
34 * 4. The names "Apache" and "Apache Software Foundation" and
35 * "Apache Lucene" must not be used to endorse or promote products
36 * derived from this software without prior written permission. For
37 * written permission, please contact apache@apache.org.
39 * 5. Products derived from this software may not be called "Apache",
40 * "Apache Lucene", nor may "Apache" appear in their name, without
41 * prior written permission of the Apache Software Foundation.
43 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
44 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
45 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
47 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
48 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
49 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
50 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
51 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
52 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
53 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * ====================================================================
57 * This software consists of voluntary contributions made by many
58 * individuals on behalf of the Apache Software Foundation. For more
59 * information on the Apache Software Foundation, please see
60 * <http://www.apache.org/>.
64 /// Expert: Scoring API.
65 /// <p>Subclasses implement search scoring.</p>
66 /// <p>The score of query <code>q</code> for document <code>d</code> is defined
67 /// in terms of these methods as follows:</p>
68 /// <table cellpadding="0" cellspacing="0" border="0">
70 /// <td valign="middle" align="right" rowspan="2">score(q,d) =<br/></td>
71 /// <td valign="middle" align="center">
72 /// <big><big><big><big><big>Sigma;</big></big></big></big></big></td>
73 /// <td valign="middle"><small>
74 /// Tf(int) tf}(t in d)
75 /// Idf(Term,Searcher) idf}(t)
76 /// Field.GetBoost GetBoost(t.field in d)
77 /// LengthNorm(String,int) lengthNorm(t.field in d)
79 /// <td valign="middle" rowspan="2">
80 /// Coord(int,int) Coord(q,d)
81 /// QueryNorm(float) QueryNorm(q)
85 /// <td valign="top" align="right">
86 /// <small>t in q</small>
91 /// <see cref="SetDefault(Similarity)"/>
92 /// <see cref="IndexWriter.SetSimilarity(Similarity)"/>
93 /// <see cref="Searcher.SetSimilarity(Similarity)"/>
95 public abstract class Similarity
98 /// The Similarity implementation used by default.
100 private static Similarity defaultImpl
= new DefaultSimilarity();
103 /// Set the default Similarity implementation used by indexing and search
105 /// <see cref="Searcher.SetSimilarity(Similarity)"/>
106 /// <see cref="IndexWriter.SetSimilarity(Similarity)"/>
108 /// <param name="similarity"></param>
109 public static void SetDefault(Similarity similarity
)
111 Similarity
.defaultImpl
= similarity
;
115 /// Return the default Similarity implementation used by indexing and search
117 /// <p>This is initially an instance of DefaultSimilarity.</p>
118 /// <see cref="Searcher.SetSimilarity(Similarity)"/>
119 /// <see cref="IndexWriter.SetSimilarity(Similarity)"/>
121 /// <returns></returns>
122 public static Similarity
GetDefault()
124 return Similarity
.defaultImpl
;
128 /// Cache of decoded bytes.
130 private static readonly float[] NORM_TABLE
= new float[256];
134 for (int i
= 0; i
< 256; i
++)
135 NORM_TABLE
[i
] = ByteToFloat((byte)i
);
139 /// Decodes a normalization factor stored in an index.
140 /// <see cref="EncodeNorm(float)"/>
142 /// <param name="b"></param>
143 /// <returns></returns>
144 public static float DecodeNorm(byte b
)
146 return NORM_TABLE
[b
& 0xFF];
150 /// Computes the normalization value for a field given the total number of
151 /// terms contained in a field. These values, together with field boosts, are
152 /// stored in an index and multipled into scores for hits on each field by the
155 /// <p>Matches in longer fields are less precise, so implemenations of this
156 /// method usually return smaller values when <code>numTokens</code> is large,
157 /// and larger values when <code>numTokens</code> is small.</p>
159 /// <p>That these values are computed under
160 /// IndexWriter.AddDocument(Document) and stored then using
161 /// EncodeNorm(float). Thus they have limited precision, and documents
162 /// must be re-indexed if this method is altered.
165 /// <see cref="Field.SetBoost(float)"/>
167 /// <param name="fieldName">the name of the field</param>
168 /// <param name="numTokens">
169 /// the total number of tokens contained in fields named
170 /// <i>fieldName</i> of <i>doc</i>.
172 /// <returns>a normalization factor for hits on this field of this document</returns>
173 public abstract float LengthNorm(String fieldName
, int numTokens
);
176 /// Computes the normalization value for a query given the sum of the squared
177 /// weights of each of the query terms. This value is then multipled into the
178 /// weight of each query term.
180 /// <p>This does not affect ranking, but rather just attempts to make scores
181 /// from different queries comparable.
184 /// <param name="sumOfSquaredWeights">the sum of the squares of query term weights</param>
185 /// <returns>a normalization factor for query weights</returns>
186 public abstract float QueryNorm(float sumOfSquaredWeights
);
189 /// Encodes a normalization factor for storage in an index.
191 /// <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
192 /// representing values from around 7x10^9 to 2x10^-9 with about one
193 /// significant decimal digit of accuracy. Zero is also represented.
194 /// Negative numbers are rounded up to zero. Values too large to represent
195 /// are rounded down to the largest representable value. Positive values too
196 /// small to represent are rounded up to the smallest positive representable
200 /// <see cref="Field.SetBoost(float)"/>
202 /// <param name="f"></param>
203 /// <returns></returns>
204 public static byte EncodeNorm(float f
)
206 return FloatToByte(f
);
209 private static float ByteToFloat(byte b
)
211 if (b
== 0) // zero is a special case
213 int mantissa
= b
& 7;
214 int exponent
= (b
>> 3) & 31;
215 int bits
= ((exponent
+(63-15)) << 24) | (mantissa
<< 21);
218 return BitConverter
.ToSingle(BitConverter
.GetBytes(bits
), 0);
221 private static byte FloatToByte(float f
)
223 if (f
< 0.0f
) // round negatives up to zero
226 if (f
== 0.0f
) // zero is a special case
229 int bits
= BitConverter
.ToInt32(BitConverter
.GetBytes(f
), 0); // parse float into parts
230 int mantissa
= (bits
& 0xffffff) >> 21;
231 int exponent
= (((bits
>> 24) & 0x7f) - 63) + 15;
234 { // overflow: use max value
240 { // underflow: use min value
245 return (byte)((exponent
<< 3) | mantissa
); // pack into a byte
249 /// Computes a score factor based on a term or phrase's frequency in a
250 /// document. This value is multiplied by the Idf(Term, Searcher)
251 /// factor for each term in the query and these products are then summed to
252 /// form the initial score for a document.
254 /// <p>Terms and phrases repeated in a document indicate the topic of the
255 /// document, so implemenations of this method usually return larger values
256 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
260 /// <p>The default implementation calls Tf(float).</p>
262 /// <param name="freq">the frequency of a term within a document</param>
263 /// <returns>a score factor based on a term's within-document frequency</returns>
264 virtual public float Tf(int freq
)
266 return Tf((float)freq
);
270 /// Computes the amount of a sloppy phrase match, based on an edit distance.
271 /// This value is summed for each sloppy phrase match in a document to form
272 /// the frequency that is passed to Tf(float).
274 /// <p>A phrase match with a small edit distance to a document passage more
275 /// closely matches the document, so implemenations of this method usually
276 /// return larger values when the edit distance is small and smaller values
277 /// when it is large.
280 /// <see cref="PhraseQuery.SetSlop(int)"/>
282 /// <param name="distance">the edit distance of this sloppy phrase match</param>
283 /// <returns>the frequency increment for this match</returns>
284 public abstract float SloppyFreq(int distance
);
287 /// Computes a score factor based on a term or phrase's frequency in a
288 /// document. This value is multiplied by the Idf(Term, Searcher)
289 /// factor for each term in the query and these products are then summed to
290 /// form the initial score for a document.
292 /// <p>Terms and phrases repeated in a document indicate the topic of the
293 /// document, so implemenations of this method usually return larger values
294 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
299 /// <param name="freq">the frequency of a term within a document</param>
300 /// <returns>a score factor based on a term's within-document frequency</returns>
301 public abstract float Tf(float freq
);
304 /// Computes a score factor for a simple term.
306 /// <p>The default implementation is:
308 /// return Idf(searcher.DocFreq(term), searcher.MaxDoc());
311 /// Note that Searcher.MaxDoc() is used instead of
312 /// IndexReader.NumDocs() because it is proportional to
313 /// Searcher.DocFreq(Term), i.e., when one is inaccurate, so is the other,
314 /// and in the same direction.
318 /// <param name="term">the term in question</param>
319 /// <param name="searcher">the document collection being searched</param>
320 /// <returns>a score factor for the term</returns>
321 virtual public float Idf(Term term
, Searcher searcher
)
323 return Idf(searcher
.DocFreq(term
), searcher
.MaxDoc());
327 /// Computes a score factor for a phrase.
328 /// <p>The default implementation sums the Idf(Term,Searcher) factor
329 /// for each term in the phrase.
332 /// <param name="terms">the vector of terms in the phrase</param>
333 /// <param name="searcher">the document collection being searched</param>
334 /// <returns>a score factor for the phrase</returns>
335 virtual public float Idf(ArrayList terms
, Searcher searcher
)
338 for (int i
= 0; i
< terms
.Count
; i
++)
340 _idf
+= Idf((Term
)terms
[i
], searcher
);
346 /// Computes a score factor based on a term's document frequency (the number
347 /// of documents which contain the term). This value is multiplied by the
348 /// Tf(int) factor for each term in the query and these products are
349 /// then summed to form the initial score for a document.
351 /// <p>Terms that occur in fewer documents are better indicators of topic, so
352 /// implemenations of this method usually return larger values for rare terms,
353 /// and smaller values for common terms.
356 /// <param name="docFreq">the number of documents which contain the term</param>
357 /// <param name="numDocs">the total number of documents in the collection</param>
358 /// <returns>a score factor based on the term's document frequency</returns>
359 public abstract float Idf(int docFreq
, int numDocs
);
362 /// Computes a score factor based on the fraction of all query terms that a
363 /// document contains. This value is multiplied into scores.
365 /// <p>The presence of a large portion of the query terms indicates a better
366 /// match with the query, so implemenations of this method usually return
367 /// larger values when the ratio between these parameters is large and smaller
368 /// values when the ratio between them is small.
371 /// <param name="overlap">the number of query terms matched in the document</param>
372 /// <param name="maxOverlap">the total number of terms in the query</param>
373 /// <returns>a score factor based on term overlap with the query</returns>
374 public abstract float Coord(int overlap
, int maxOverlap
);