beagled/Lucene.Net/Search/Similarity.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 using System;
  17 using Field = Lucene.Net.Documents.Field;
  18 using IndexReader = Lucene.Net.Index.IndexReader;
  19 using IndexWriter = Lucene.Net.Index.IndexWriter;
  20 using Term = Lucene.Net.Index.Term;
  21 namespace Lucene.Net.Search
  22 {
  23
  24
  25     /// <summary>Expert: Scoring API.
  26     /// <p>Subclasses implement search scoring.
  27     ///
  28     /// <p>The score of query <code>q</code> for document <code>d</code> is defined
  29     /// in terms of these methods as follows:
  30     ///
  31     /// <table cellpadding="0" cellspacing="0" border="0">
  32     /// <tr>
  33     /// <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
  34     /// <td valign="middle" align="center">
  35     /// <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
  36     /// <td valign="middle"><small>
  37     /// ( {@link #Tf(int) tf}(t in d) *
  38     /// {@link #Idf(Term,Searcher) idf}(t)^2 *
  39     /// {@link Query#getBoost getBoost}(t in q) *
  40     /// {@link Field#getBoost getBoost}(t.field in d) *
  41     /// {@link #LengthNorm(String,int) lengthNorm}(t.field in d) )
  42     /// </small></td>
  43     /// <td valign="middle" rowspan="2">&nbsp;*
  44     /// {@link #Coord(int,int) coord}(q,d) *
  45     /// {@link #QueryNorm(float) queryNorm}(sumOfSqaredWeights)
  46     /// </td>
  47     /// </tr>
  48     /// <tr>
  49     /// <td valign="top" align="right">
  50     /// <small>t in q</small>
  51     /// </td>
  52     /// </tr>
  53     /// </table>
  54     ///
  55     /// <p> where
  56     ///
  57     /// <table cellpadding="0" cellspacing="0" border="0">
  58     /// <tr>
  59     /// <td valign="middle" align="right" rowspan="2">sumOfSqaredWeights =<br></td>
  60     /// <td valign="middle" align="center">
  61     /// <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
  62     /// <td valign="middle"><small>
  63     /// ( {@link #Idf(Term,Searcher) idf}(t) *
  64     /// {@link Query#getBoost getBoost}(t in q) )^2
  65     /// </small></td>
  66     /// </tr>
  67     /// <tr>
  68     /// <td valign="top" align="right">
  69     /// <small>t in q</small>
  70     /// </td>
  71     /// </tr>
  72     /// </table>
  73     ///
  74     /// <p> Note that the above formula is motivated by the cosine-distance or dot-product
  75     /// between document and query vector, which is implemented by {@link DefaultSimilarity}.
  76     ///
  77     /// </summary>
  78     /// <seealso cref="#SetDefault(Similarity)">
  79     /// </seealso>
  80     /// <seealso cref="IndexWriter#SetSimilarity(Similarity)">
  81     /// </seealso>
  82     /// <seealso cref="Searcher#SetSimilarity(Similarity)">
  83     /// </seealso>
  84     [Serializable]
  85     public abstract class Similarity
  86     {
  87                 /// <summary>The Similarity implementation used by default. </summary>
  88                 private static Similarity defaultImpl = new DefaultSimilarity();
  89
  90                 /// <summary>Set the default Similarity implementation used by indexing and search
  91                 /// code.
  92                 ///
  93                 /// </summary>
  94                 /// <seealso cref="Searcher#SetSimilarity(Similarity)">
  95                 /// </seealso>
  96                 /// <seealso cref="IndexWriter#SetSimilarity(Similarity)">
  97                 /// </seealso>
  98                 public static void  SetDefault(Similarity similarity)
  99                 {
 100                         Similarity.defaultImpl = similarity;
 101                 }
 102
 103                 /// <summary>Return the default Similarity implementation used by indexing and search
 104                 /// code.
 105                 ///
 106                 /// <p>This is initially an instance of {@link DefaultSimilarity}.
 107                 ///
 108                 /// </summary>
 109                 /// <seealso cref="Searcher#SetSimilarity(Similarity)">
 110                 /// </seealso>
 111                 /// <seealso cref="IndexWriter#SetSimilarity(Similarity)">
 112                 /// </seealso>
 113                 public static Similarity GetDefault()
 114                 {
 115                         return Similarity.defaultImpl;
 116                 }
 117
 118                 /// <summary>Cache of decoded bytes. </summary>
 119                 private static readonly float[] NORM_TABLE = new float[256];
 120
 121                 /// <summary>Decodes a normalization factor stored in an index.</summary>
 122                 /// <seealso cref="#EncodeNorm(float)">
 123                 /// </seealso>
 124                 public static float DecodeNorm(byte b)
 125                 {
 126                         return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
 127                 }
 128
 129         /// <summary>Returns a table for decoding normalization bytes.</summary>
 130         /// <seealso cref="#EncodeNorm(float)">
 131         /// </seealso>
 132         public static float[] GetNormDecoder()
 133         {
 134             return NORM_TABLE;
 135         }
 136
 137         /// <summary>Computes the normalization value for a field given the total number of
 138         /// terms contained in a field.  These values, together with field boosts, are
 139         /// stored in an index and multipled into scores for hits on each field by the
 140         /// search code.
 141         ///
 142         /// <p>Matches in longer fields are less precise, so implementations of this
 143         /// method usually return smaller values when <code>numTokens</code> is large,
 144         /// and larger values when <code>numTokens</code> is small.
 145         ///
 146         /// <p>That these values are computed under {@link
 147         /// IndexWriter#AddDocument(Lucene.Net.document.Document)} and stored then using
 148         /// {@link #EncodeNorm(float)}.  Thus they have limited precision, and documents
 149         /// must be re-indexed if this method is altered.
 150         ///
 151         /// </summary>
 152         /// <param name="fieldName">the name of the field
 153         /// </param>
 154         /// <param name="numTokens">the total number of tokens contained in fields named
 155         /// <i>fieldName</i> of <i>doc</i>.
 156         /// </param>
 157         /// <returns> a normalization factor for hits on this field of this document
 158         ///
 159         /// </returns>
 160         /// <seealso cref="Field#SetBoost(float)">
 161         /// </seealso>
 162         public abstract float LengthNorm(System.String fieldName, int numTokens);
 163
 164                 /// <summary>Computes the normalization value for a query given the sum of the squared
 165                 /// weights of each of the query terms.  This value is then multipled into the
 166                 /// weight of each query term.
 167                 ///
 168                 /// <p>This does not affect ranking, but rather just attempts to make scores
 169                 /// from different queries comparable.
 170                 ///
 171                 /// </summary>
 172                 /// <param name="sumOfSquaredWeights">the sum of the squares of query term weights
 173                 /// </param>
 174                 /// <returns> a normalization factor for query weights
 175                 /// </returns>
 176                 public abstract float QueryNorm(float sumOfSquaredWeights);
 177
 178                 /// <summary>Encodes a normalization factor for storage in an index.
 179                 ///
 180                 /// <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
 181                 /// representing values from around 7x10^9 to 2x10^-9 with about one
 182                 /// significant decimal digit of accuracy.  Zero is also represented.
 183                 /// Negative numbers are rounded up to zero.  Values too large to represent
 184                 /// are rounded down to the largest representable value.  Positive values too
 185                 /// small to represent are rounded up to the smallest positive representable
 186                 /// value.
 187                 ///
 188                 /// </summary>
 189                 /// <seealso cref="Field#SetBoost(float)">
 190                 /// </seealso>
 191                 public static byte EncodeNorm(float f)
 192                 {
 193                         return FloatToByte(f);
 194                 }
 195
 196                 private static float ByteToFloat(byte b)
 197                 {
 198                         if (b == 0)
 199                         // zero is a special case
 200                                 return 0.0f;
 201                         int mantissa = b & 7;
 202                         int exponent = (b >> 3) & 31;
 203                         int bits = ((exponent + (63 - 15)) << 24) | (mantissa << 21);
 204             return BitConverter.ToSingle(BitConverter.GetBytes(bits), 0);
 205                 }
 206
 207                 private static byte FloatToByte(float f)
 208                 {
 209                         if (f < 0.0f)
 210                         // round negatives up to zero
 211                                 f = 0.0f;
 212
 213                         if (f == 0.0f)
 214                         // zero is a special case
 215                                 return 0;
 216
 217                         int bits = BitConverter.ToInt32(BitConverter.GetBytes(f), 0); // parse float into parts
 218                         int mantissa = (bits & 0xffffff) >> 21;
 219                         int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
 220
 221                         if (exponent > 31)
 222                         {
 223                                 // overflow: use max value
 224                                 exponent = 31;
 225                                 mantissa = 7;
 226                         }
 227
 228                         if (exponent < 0)
 229                         {
 230                                 // underflow: use min value
 231                                 exponent = 0;
 232                                 mantissa = 1;
 233                         }
 234
 235                         return (byte) ((exponent << 3) | mantissa); // pack into a byte
 236                 }
 237
 238
 239                 /// <summary>Computes a score factor based on a term or phrase's frequency in a
 240                 /// document.  This value is multiplied by the {@link #Idf(Term, Searcher)}
 241                 /// factor for each term in the query and these products are then summed to
 242                 /// form the initial score for a document.
 243                 ///
 244                 /// <p>Terms and phrases repeated in a document indicate the topic of the
 245                 /// document, so implementations of this method usually return larger values
 246                 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
 247                 /// is small.
 248                 ///
 249                 /// <p>The default implementation calls {@link #Tf(float)}.
 250                 ///
 251                 /// </summary>
 252                 /// <param name="freq">the frequency of a term within a document
 253                 /// </param>
 254                 /// <returns> a score factor based on a term's within-document frequency
 255                 /// </returns>
 256                 public virtual float Tf(int freq)
 257                 {
 258                         return Tf((float) freq);
 259                 }
 260
 261                 /// <summary>Computes the amount of a sloppy phrase match, based on an edit distance.
 262                 /// This value is summed for each sloppy phrase match in a document to form
 263                 /// the frequency that is passed to {@link #Tf(float)}.
 264                 ///
 265                 /// <p>A phrase match with a small edit distance to a document passage more
 266                 /// closely matches the document, so implementations of this method usually
 267                 /// return larger values when the edit distance is small and smaller values
 268                 /// when it is large.
 269                 ///
 270                 /// </summary>
 271                 /// <seealso cref="PhraseQuery#SetSlop(int)">
 272                 /// </seealso>
 273                 /// <param name="distance">the edit distance of this sloppy phrase match
 274                 /// </param>
 275                 /// <returns> the frequency increment for this match
 276                 /// </returns>
 277                 public abstract float SloppyFreq(int distance);
 278
 279                 /// <summary>Computes a score factor based on a term or phrase's frequency in a
 280                 /// document.  This value is multiplied by the {@link #Idf(Term, Searcher)}
 281                 /// factor for each term in the query and these products are then summed to
 282                 /// form the initial score for a document.
 283                 ///
 284                 /// <p>Terms and phrases repeated in a document indicate the topic of the
 285                 /// document, so implementations of this method usually return larger values
 286                 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
 287                 /// is small.
 288                 ///
 289                 /// </summary>
 290                 /// <param name="freq">the frequency of a term within a document
 291                 /// </param>
 292                 /// <returns> a score factor based on a term's within-document frequency
 293                 /// </returns>
 294                 public abstract float Tf(float freq);
 295
 296                 /// <summary>Computes a score factor for a simple term.
 297                 ///
 298                 /// <p>The default implementation is:<pre>
 299                 /// return idf(searcher.docFreq(term), searcher.maxDoc());
 300                 /// </pre>
 301                 ///
 302                 /// Note that {@link Searcher#MaxDoc()} is used instead of
 303                 /// {@link IndexReader#NumDocs()} because it is proportional to
 304                 /// {@link Searcher#DocFreq(Term)} , i.e., when one is inaccurate,
 305                 /// so is the other, and in the same direction.
 306                 ///
 307                 /// </summary>
 308                 /// <param name="term">the term in question
 309                 /// </param>
 310                 /// <param name="searcher">the document collection being searched
 311                 /// </param>
 312                 /// <returns> a score factor for the term
 313                 /// </returns>
 314                 public virtual float Idf(Term term, Searcher searcher)
 315                 {
 316                         return Idf(searcher.DocFreq(term), searcher.MaxDoc());
 317                 }
 318
 319                 /// <summary>Computes a score factor for a phrase.
 320                 ///
 321                 /// <p>The default implementation sums the {@link #Idf(Term,Searcher)} factor
 322                 /// for each term in the phrase.
 323                 ///
 324                 /// </summary>
 325                 /// <param name="terms">the terms in the phrase
 326                 /// </param>
 327                 /// <param name="searcher">the document collection being searched
 328                 /// </param>
 329                 /// <returns> a score factor for the phrase
 330                 /// </returns>
 331                 public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
 332                 {
 333                         float idf = 0.0f;
 334                         System.Collections.IEnumerator i = terms.GetEnumerator();
 335                         while (i.MoveNext())
 336                         {
 337                                 idf += Idf((Term) i.Current, searcher);
 338                         }
 339                         return idf;
 340                 }
 341
 342                 /// <summary>Computes a score factor based on a term's document frequency (the number
 343                 /// of documents which contain the term).  This value is multiplied by the
 344                 /// {@link #Tf(int)} factor for each term in the query and these products are
 345                 /// then summed to form the initial score for a document.
 346                 ///
 347                 /// <p>Terms that occur in fewer documents are better indicators of topic, so
 348                 /// implementations of this method usually return larger values for rare terms,
 349                 /// and smaller values for common terms.
 350                 ///
 351                 /// </summary>
 352                 /// <param name="docFreq">the number of documents which contain the term
 353                 /// </param>
 354                 /// <param name="numDocs">the total number of documents in the collection
 355                 /// </param>
 356                 /// <returns> a score factor based on the term's document frequency
 357                 /// </returns>
 358                 public abstract float Idf(int docFreq, int numDocs);
 359
 360                 /// <summary>Computes a score factor based on the fraction of all query terms that a
 361                 /// document contains.  This value is multiplied into scores.
 362                 ///
 363                 /// <p>The presence of a large portion of the query terms indicates a better
 364                 /// match with the query, so implementations of this method usually return
 365                 /// larger values when the ratio between these parameters is large and smaller
 366                 /// values when the ratio between them is small.
 367                 ///
 368                 /// </summary>
 369                 /// <param name="overlap">the number of query terms matched in the document
 370                 /// </param>
 371                 /// <param name="maxOverlap">the total number of terms in the query
 372                 /// </param>
 373                 /// <returns> a score factor based on term overlap with the query
 374                 /// </returns>
 375                 public abstract float Coord(int overlap, int maxOverlap);
 376                 static Similarity()
 377                 {
 378                         {
 379                                 for (int i = 0; i < 256; i++)
 380                                         NORM_TABLE[i] = ByteToFloat((byte) i);
 381                         }
 382                 }
 383         }
 384 }