// beagled/Lucene.Net/Search/Similarity.cs
/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Field = Lucene.Net.Documents.Field;
using IndexReader = Lucene.Net.Index.IndexReader;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using Term = Lucene.Net.Index.Term;
using SmallFloat = Lucene.Net.Util.SmallFloat;

namespace Lucene.Net.Search
{
	
	/// <summary>Expert: Scoring API.
	/// <p>Subclasses implement search scoring.
	///
	/// <p>The score of query <code>q</code> for document <code>d</code> is defined
	/// in terms of these methods as follows:
	///
	/// <table cellpadding="0" cellspacing="0" border="0">
	/// <tr>
	/// <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
	/// <td valign="middle" align="center">
	/// <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
	/// <td valign="middle"><small>
	/// ( {@link #Tf(int) tf}(t in d) *
	/// {@link #Idf(Term,Searcher) idf}(t)^2 *
	/// {@link Query#getBoost getBoost}(t in q) *
	/// {@link Field#getBoost getBoost}(t.field in d) *
	/// {@link #LengthNorm(String,int) lengthNorm}(t.field in d) )
	/// </small></td>
	/// <td valign="middle" rowspan="2">&nbsp;*
	/// {@link #Coord(int,int) coord}(q,d) *
	/// {@link #QueryNorm(float) queryNorm}(sumOfSquaredWeights)
	/// </td>
	/// </tr>
	/// <tr>
	/// <td valign="top" align="right">
	/// <small>t in q</small>
	/// </td>
	/// </tr>
	/// </table>
	///
	/// <p> where
	///
	/// <table cellpadding="0" cellspacing="0" border="0">
	/// <tr>
	/// <td valign="middle" align="right" rowspan="2">sumOfSquaredWeights =<br></td>
	/// <td valign="middle" align="center">
	/// <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
	/// <td valign="middle"><small>
	/// ( {@link #Idf(Term,Searcher) idf}(t) *
	/// {@link Query#getBoost getBoost}(t in q) )^2
	/// </small></td>
	/// </tr>
	/// <tr>
	/// <td valign="top" align="right">
	/// <small>t in q</small>
	/// </td>
	/// </tr>
	/// </table>
	///
	/// <p> Note that the above formula is motivated by the cosine distance, or dot product,
	/// between the document and query vectors, which is implemented by {@link DefaultSimilarity}.
	///
	/// </summary>
	/// <seealso cref="SetDefault(Similarity)">
	/// </seealso>
	/// <seealso cref="IndexWriter.SetSimilarity(Similarity)">
	/// </seealso>
	/// <seealso cref="Searcher.SetSimilarity(Similarity)">
	/// </seealso>
	[Serializable]
	public abstract class Similarity
	{
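
		// As a rough, hedged sketch of how the formula documented above combines
		// (the real computation is spread across the query's Weight and Scorer
		// classes), a score could be assembled like this, where FreqInDoc,
		// QueryBoost and FieldNormByte are hypothetical helpers:
		//
		//     float sum = 0.0f;
		//     foreach (Term t in queryTerms)
		//         sum += similarity.Tf(FreqInDoc(t, doc))
		//                * similarity.Idf(t, searcher) * similarity.Idf(t, searcher)
		//                * QueryBoost(t)
		//                * Similarity.DecodeNorm(FieldNormByte(t, doc));
		//     float score = sum
		//                   * similarity.Coord(matchedTerms, totalQueryTerms)
		//                   * similarity.QueryNorm(sumOfSquaredWeights);
		//
		// The decoded field norm folds together the field boost and the lengthNorm
		// value computed at indexing time.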

		/// <summary>The Similarity implementation used by default. </summary>
		private static Similarity defaultImpl = new DefaultSimilarity();
		
		/// <summary>Set the default Similarity implementation used by indexing and search
		/// code.
		///
		/// </summary>
		/// <seealso cref="Searcher.SetSimilarity(Similarity)">
		/// </seealso>
		/// <seealso cref="IndexWriter.SetSimilarity(Similarity)">
		/// </seealso>
		public static void SetDefault(Similarity similarity)
		{
			Similarity.defaultImpl = similarity;
		}
		
		/// <summary>Return the default Similarity implementation used by indexing and search
		/// code.
		///
		/// <p>This is initially an instance of {@link DefaultSimilarity}.
		///
		/// </summary>
		/// <seealso cref="Searcher.SetSimilarity(Similarity)">
		/// </seealso>
		/// <seealso cref="IndexWriter.SetSimilarity(Similarity)">
		/// </seealso>
		public static Similarity GetDefault()
		{
			return Similarity.defaultImpl;
		}
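
		// For illustration, a caller would typically install a custom Similarity
		// before both indexing and searching, so that stored norms and query-time
		// scores stay consistent (MySimilarity is a hypothetical subclass):
		//
		//     Similarity.SetDefault(new MySimilarity());
		//     // or per instance:
		//     indexWriter.SetSimilarity(new MySimilarity());
		//     searcher.SetSimilarity(new MySimilarity());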

		/// <summary>Cache of decoded bytes. </summary>
		private static readonly float[] NORM_TABLE = new float[256];
		
		/// <summary>Decodes a normalization factor stored in an index.</summary>
		/// <seealso cref="EncodeNorm(float)">
		/// </seealso>
		public static float DecodeNorm(byte b)
		{
			return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive values above 127
		}
		
		/// <summary>Returns a table for decoding normalization bytes.</summary>
		/// <seealso cref="EncodeNorm(float)">
		/// </seealso>
		public static float[] GetNormDecoder()
		{
			return NORM_TABLE;
		}

		/// <summary>Computes the normalization value for a field given the total number of
		/// terms contained in a field. These values, together with field boosts, are
		/// stored in an index and multiplied into scores for hits on each field by the
		/// search code.
		///
		/// <p>Matches in longer fields are less precise, so implementations of this
		/// method usually return smaller values when <code>numTokens</code> is large,
		/// and larger values when <code>numTokens</code> is small.
		///
		/// <p>Note that these values are computed under {@link
		/// IndexWriter#AddDocument(Lucene.Net.Documents.Document)} and then stored using
		/// {@link #EncodeNorm(float)}. Thus they have limited precision, and documents
		/// must be re-indexed if this method is altered.
		///
		/// </summary>
		/// <param name="fieldName">the name of the field
		/// </param>
		/// <param name="numTokens">the total number of tokens contained in fields named
		/// <i>fieldName</i> of <i>doc</i>.
		/// </param>
		/// <returns> a normalization factor for hits on this field of this document
		///
		/// </returns>
		/// <seealso cref="Field.SetBoost(float)">
		/// </seealso>
		public abstract float LengthNorm(System.String fieldName, int numTokens);
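
		// For illustration, the stock DefaultSimilarity computes this as
		// 1 / sqrt(numTokens); a minimal overriding sketch:
		//
		//     public override float LengthNorm(System.String fieldName, int numTokens)
		//     {
		//         return (float) (1.0 / System.Math.Sqrt(numTokens));
		//     }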

		/// <summary>Computes the normalization value for a query given the sum of the squared
		/// weights of each of the query terms. This value is then multiplied into the
		/// weight of each query term.
		///
		/// <p>This does not affect ranking, but rather just attempts to make scores
		/// from different queries comparable.
		///
		/// </summary>
		/// <param name="sumOfSquaredWeights">the sum of the squares of query term weights
		/// </param>
		/// <returns> a normalization factor for query weights
		/// </returns>
		public abstract float QueryNorm(float sumOfSquaredWeights);
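
		// For illustration, the stock DefaultSimilarity computes this as
		// 1 / sqrt(sumOfSquaredWeights); a minimal overriding sketch:
		//
		//     public override float QueryNorm(float sumOfSquaredWeights)
		//     {
		//         return (float) (1.0 / System.Math.Sqrt(sumOfSquaredWeights));
		//     }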

		/// <summary>Encodes a normalization factor for storage in an index.
		///
		/// <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
		/// the zero-exponent point at 15, thus
		/// representing values from around 7x10^9 down to 2x10^-9 with about one
		/// significant decimal digit of accuracy. Zero is also represented.
		/// Negative numbers are rounded up to zero. Values too large to represent
		/// are rounded down to the largest representable value. Positive values too
		/// small to represent are rounded up to the smallest positive representable
		/// value.
		///
		/// </summary>
		/// <seealso cref="Field.SetBoost(float)">
		/// </seealso>
		/// <seealso cref="SmallFloat">
		/// </seealso>
		public static byte EncodeNorm(float f)
		{
			return (byte) SmallFloat.FloatToByte315(f);
		}
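
		// A small round-trip example illustrating the limited precision of the
		// one-byte encoding:
		//
		//     byte b = Similarity.EncodeNorm(0.89f);
		//     float f = Similarity.DecodeNorm(b); // close to, but not exactly, 0.89f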

		/// <summary>Computes a score factor based on a term or phrase's frequency in a
		/// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
		/// factor for each term in the query and these products are then summed to
		/// form the initial score for a document.
		///
		/// <p>Terms and phrases repeated in a document indicate the topic of the
		/// document, so implementations of this method usually return larger values
		/// when <code>freq</code> is large, and smaller values when <code>freq</code>
		/// is small.
		///
		/// <p>The default implementation calls {@link #Tf(float)}.
		///
		/// </summary>
		/// <param name="freq">the frequency of a term within a document
		/// </param>
		/// <returns> a score factor based on a term's within-document frequency
		/// </returns>
		public virtual float Tf(int freq)
		{
			return Tf((float) freq);
		}

		/// <summary>Computes the amount of a sloppy phrase match, based on an edit distance.
		/// This value is summed for each sloppy phrase match in a document to form
		/// the frequency that is passed to {@link #Tf(float)}.
		///
		/// <p>A phrase match with a small edit distance to a document passage more
		/// closely matches the document, so implementations of this method usually
		/// return larger values when the edit distance is small and smaller values
		/// when it is large.
		///
		/// </summary>
		/// <seealso cref="PhraseQuery.SetSlop(int)">
		/// </seealso>
		/// <param name="distance">the edit distance of this sloppy phrase match
		/// </param>
		/// <returns> the frequency increment for this match
		/// </returns>
		public abstract float SloppyFreq(int distance);
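
		// For illustration, the stock DefaultSimilarity computes this as
		// 1 / (distance + 1); a minimal overriding sketch:
		//
		//     public override float SloppyFreq(int distance)
		//     {
		//         return 1.0f / (distance + 1);
		//     }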

		/// <summary>Computes a score factor based on a term or phrase's frequency in a
		/// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
		/// factor for each term in the query and these products are then summed to
		/// form the initial score for a document.
		///
		/// <p>Terms and phrases repeated in a document indicate the topic of the
		/// document, so implementations of this method usually return larger values
		/// when <code>freq</code> is large, and smaller values when <code>freq</code>
		/// is small.
		///
		/// </summary>
		/// <param name="freq">the frequency of a term within a document
		/// </param>
		/// <returns> a score factor based on a term's within-document frequency
		/// </returns>
		public abstract float Tf(float freq);
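
		// For illustration, the stock DefaultSimilarity computes this as
		// sqrt(freq); a minimal overriding sketch:
		//
		//     public override float Tf(float freq)
		//     {
		//         return (float) System.Math.Sqrt(freq);
		//     }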

		/// <summary>Computes a score factor for a simple term.
		///
		/// <p>The default implementation is:<pre>
		/// return Idf(searcher.DocFreq(term), searcher.MaxDoc());
		/// </pre>
		///
		/// Note that {@link Searcher#MaxDoc()} is used instead of
		/// {@link IndexReader#NumDocs()} because it is proportional to
		/// {@link Searcher#DocFreq(Term)}, i.e., when one is inaccurate,
		/// so is the other, and in the same direction.
		///
		/// </summary>
		/// <param name="term">the term in question
		/// </param>
		/// <param name="searcher">the document collection being searched
		/// </param>
		/// <returns> a score factor for the term
		/// </returns>
		public virtual float Idf(Term term, Searcher searcher)
		{
			return Idf(searcher.DocFreq(term), searcher.MaxDoc());
		}

		/// <summary>Computes a score factor for a phrase.
		///
		/// <p>The default implementation sums the {@link #Idf(Term,Searcher)} factor
		/// for each term in the phrase.
		///
		/// </summary>
		/// <param name="terms">the terms in the phrase
		/// </param>
		/// <param name="searcher">the document collection being searched
		/// </param>
		/// <returns> a score factor for the phrase
		/// </returns>
		public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
		{
			float idf = 0.0f;
			System.Collections.IEnumerator i = terms.GetEnumerator();
			while (i.MoveNext())
			{
				idf += Idf((Term) i.Current, searcher);
			}
			return idf;
		}

		/// <summary>Computes a score factor based on a term's document frequency (the number
		/// of documents which contain the term). This value is multiplied by the
		/// {@link #Tf(int)} factor for each term in the query and these products are
		/// then summed to form the initial score for a document.
		///
		/// <p>Terms that occur in fewer documents are better indicators of topic, so
		/// implementations of this method usually return larger values for rare terms,
		/// and smaller values for common terms.
		///
		/// </summary>
		/// <param name="docFreq">the number of documents which contain the term
		/// </param>
		/// <param name="numDocs">the total number of documents in the collection
		/// </param>
		/// <returns> a score factor based on the term's document frequency
		/// </returns>
		public abstract float Idf(int docFreq, int numDocs);
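
		// For illustration, the stock DefaultSimilarity computes this as
		// log(numDocs / (docFreq + 1)) + 1; a minimal overriding sketch:
		//
		//     public override float Idf(int docFreq, int numDocs)
		//     {
		//         return (float) (System.Math.Log(numDocs / (double) (docFreq + 1)) + 1.0);
		//     }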

		/// <summary>Computes a score factor based on the fraction of all query terms that a
		/// document contains. This value is multiplied into scores.
		///
		/// <p>The presence of a large portion of the query terms indicates a better
		/// match with the query, so implementations of this method usually return
		/// larger values when the ratio between these parameters is large and smaller
		/// values when the ratio between them is small.
		///
		/// </summary>
		/// <param name="overlap">the number of query terms matched in the document
		/// </param>
		/// <param name="maxOverlap">the total number of terms in the query
		/// </param>
		/// <returns> a score factor based on term overlap with the query
		/// </returns>
		public abstract float Coord(int overlap, int maxOverlap);
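
		// For illustration, the stock DefaultSimilarity computes this as
		// overlap / maxOverlap; a minimal overriding sketch:
		//
		//     public override float Coord(int overlap, int maxOverlap)
		//     {
		//         return overlap / (float) maxOverlap;
		//     }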

		static Similarity()
		{
			for (int i = 0; i < 256; i++)
				NORM_TABLE[i] = SmallFloat.Byte315ToFloat((byte) i);
		}
	}
}