Lucene.Net/Search/Similarity.cs

   1 using System;
   2 using System.Collections;
   3
   4 using Lucene.Net.Index;
   5 using Lucene.Net.Documents;
   6
   7 namespace Lucene.Net.Search
   8 {
   9         /* ====================================================================
  10          * The Apache Software License, Version 1.1
  11          *
  12          * Copyright (c) 2001 The Apache Software Foundation.  All rights
  13          * reserved.
  14          *
  15          * Redistribution and use in source and binary forms, with or without
  16          * modification, are permitted provided that the following conditions
  17          * are met:
  18          *
  19          * 1. Redistributions of source code must retain the above copyright
  20          *    notice, this list of conditions and the following disclaimer.
  21          *
  22          * 2. Redistributions in binary form must reproduce the above copyright
  23          *    notice, this list of conditions and the following disclaimer in
  24          *    the documentation and/or other materials provided with the
  25          *    distribution.
  26          *
  27          * 3. The end-user documentation included with the redistribution,
  28          *    if any, must include the following acknowledgment:
  29          *       "This product includes software developed by the
  30          *        Apache Software Foundation (http://www.apache.org/)."
  31          *    Alternately, this acknowledgment may appear in the software itself,
  32          *    if and wherever such third-party acknowledgments normally appear.
  33          *
  34          * 4. The names "Apache" and "Apache Software Foundation" and
  35          *    "Apache Lucene" must not be used to endorse or promote products
  36          *    derived from this software without prior written permission. For
  37          *    written permission, please contact apache@apache.org.
  38          *
  39          * 5. Products derived from this software may not be called "Apache",
  40          *    "Apache Lucene", nor may "Apache" appear in their name, without
  41          *    prior written permission of the Apache Software Foundation.
  42          *
  43          * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  44          * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  45          * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  46          * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  47          * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  48          * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  49          * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  50          * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  51          * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  52          * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  53          * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  54          * SUCH DAMAGE.
  55          * ====================================================================
  56          *
  57          * This software consists of voluntary contributions made by many
  58          * individuals on behalf of the Apache Software Foundation.  For more
  59          * information on the Apache Software Foundation, please see
  60          * <http://www.apache.org/>.
  61          */
  62
  63         /// <summary>
  64         /// Expert: Scoring API.
  65         /// <p>Subclasses implement search scoring.</p>
  66         /// <p>The score of query <code>q</code> for document <code>d</code> is defined
  67         /// in terms of these methods as follows:</p>
  68         /// <table cellpadding="0" cellspacing="0" border="0">
  69         ///  <tr>
  70         ///    <td valign="middle" align="right" rowspan="2">score(q,d) =<br/></td>
  71         ///    <td valign="middle" align="center">
  72         ///    <big><big><big><big><big>Sigma;</big></big></big></big></big></td>
  73         ///    <td valign="middle"><small>
  74         ///             Tf(int) tf}(t in d)
  75         ///     Idf(Term,Searcher) idf}(t)
  76         ///             Field.GetBoost GetBoost(t.field in d)
  77         ///     LengthNorm(String,int) lengthNorm(t.field in d)
  78         ///    </small></td>
  79         ///    <td valign="middle" rowspan="2">
  80         ///     Coord(int,int) Coord(q,d)
  81         ///     QueryNorm(float) QueryNorm(q)
  82         ///    </td>
  83         ///  </tr>
  84         ///  <tr>
  85         ///   <td valign="top" align="right">
  86         ///    <small>t in q</small>
  87         ///    </td>
  88         ///  </tr>
  89         /// </table>
  90         ///
  91         /// <see cref="SetDefault(Similarity)"/>
  92         /// <see cref="IndexWriter.SetSimilarity(Similarity)"/>
  93         /// <see cref="Searcher.SetSimilarity(Similarity)"/>
  94         /// </summary>
  95         public abstract class Similarity
  96         {
  97                 /// <summary>
  98                 /// The Similarity implementation used by default.
  99                 /// </summary>
 100                 private static Similarity defaultImpl = new DefaultSimilarity();
 101
 102                 /// <summary>
 103                 /// Set the default Similarity implementation used by indexing and search
 104                 /// code.
 105                 /// <see cref="Searcher.SetSimilarity(Similarity)"/>
 106                 /// <see cref="IndexWriter.SetSimilarity(Similarity)"/>
 107                 /// </summary>
 108                 /// <param name="similarity"></param>
 109                 public static void SetDefault(Similarity similarity)
 110                 {
 111                         Similarity.defaultImpl = similarity;
 112                 }
 113
 114                 /// <summary>
 115                 /// Return the default Similarity implementation used by indexing and search
 116                 /// code.
 117                 /// <p>This is initially an instance of DefaultSimilarity.</p>
 118                 /// <see cref="Searcher.SetSimilarity(Similarity)"/>
 119                 /// <see cref="IndexWriter.SetSimilarity(Similarity)"/>
 120                 /// </summary>
 121                 /// <returns></returns>
 122                 public static Similarity GetDefault()
 123                 {
 124                         return Similarity.defaultImpl;
 125                 }
 126
 127                 /// <summary>
 128                 /// Cache of decoded bytes.
 129                 /// </summary>
 130                 private static readonly float[] NORM_TABLE = new float[256];
 131
 132                 static Similarity()
 133                 {
 134                         for (int i = 0; i < 256; i++)
 135                                 NORM_TABLE[i] = ByteToFloat((byte)i);
 136                 }
 137
 138                 /// <summary>
 139                 /// Decodes a normalization factor stored in an index.
 140                 /// <see cref="EncodeNorm(float)"/>
 141                 /// </summary>
 142                 /// <param name="b"></param>
 143                 /// <returns></returns>
 144                 public static float DecodeNorm(byte b)
 145                 {
 146                         return NORM_TABLE[b & 0xFF];
 147                 }
 148
 149                 /// <summary>
 150                 /// Computes the normalization value for a field given the total number of
 151                 /// terms contained in a field.  These values, together with field boosts, are
 152                 /// stored in an index and multipled into scores for hits on each field by the
 153                 /// search code.
 154                 ///
 155                 /// <p>Matches in longer fields are less precise, so implemenations of this
 156                 /// method usually return smaller values when <code>numTokens</code> is large,
 157                 /// and larger values when <code>numTokens</code> is small.</p>
 158                 ///
 159                 /// <p>That these values are computed under
 160                 /// IndexWriter.AddDocument(Document) and stored then using
 161                 /// EncodeNorm(float).  Thus they have limited precision, and documents
 162                 /// must be re-indexed if this method is altered.
 163                 /// </p>
 164                 ///
 165                 /// <see cref="Field.SetBoost(float)"/>
 166                 /// </summary>
 167                 /// <param name="fieldName">the name of the field</param>
 168                 /// <param name="numTokens">
 169                 ///             the total number of tokens contained in fields named
 170                 ///             <i>fieldName</i> of <i>doc</i>.
 171                 /// </param>
 172                 /// <returns>a normalization factor for hits on this field of this document</returns>
 173                 public abstract float LengthNorm(String fieldName, int numTokens);
 174
 175                 /// <summary>
 176                 /// Computes the normalization value for a query given the sum of the squared
 177                 /// weights of each of the query terms.  This value is then multipled into the
 178                 /// weight of each query term.
 179                 ///
 180                 /// <p>This does not affect ranking, but rather just attempts to make scores
 181                 /// from different queries comparable.
 182                 /// </p>
 183                 /// </summary>
 184                 /// <param name="sumOfSquaredWeights">the sum of the squares of query term weights</param>
 185                 /// <returns>a normalization factor for query weights</returns>
 186                 public abstract float QueryNorm(float sumOfSquaredWeights);
 187
 188                 /// <summary>
 189                 /// Encodes a normalization factor for storage in an index.
 190                 ///
 191                 /// <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
 192                 /// representing values from around 7x10^9 to 2x10^-9 with about one
 193                 /// significant decimal digit of accuracy.  Zero is also represented.
 194                 /// Negative numbers are rounded up to zero.  Values too large to represent
 195                 /// are rounded down to the largest representable value.  Positive values too
 196                 /// small to represent are rounded up to the smallest positive representable
 197                 /// value.
 198                 /// </p>
 199                 ///
 200                 /// <see cref="Field.SetBoost(float)"/>
 201                 /// </summary>
 202                 /// <param name="f"></param>
 203                 /// <returns></returns>
 204                 public static byte EncodeNorm(float f)
 205                 {
 206                         return FloatToByte(f);
 207                 }
 208
 209                 private static float ByteToFloat(byte b)
 210                 {
 211                         if (b == 0)                                   // zero is a special case
 212                                 return 0.0f;
 213                         int mantissa = b & 7;
 214                         int exponent = (b >> 3) & 31;
 215                         int bits = ((exponent+(63-15)) << 24) | (mantissa << 21);
 216
 217
 218                         return BitConverter.ToSingle(BitConverter.GetBytes(bits), 0);
 219                 }
 220
 221                 private static byte FloatToByte(float f)
 222                 {
 223                         if (f < 0.0f)                                 // round negatives up to zero
 224                                 f = 0.0f;
 225
 226                         if (f == 0.0f)                                // zero is a special case
 227                                 return 0;
 228
 229                         int bits = BitConverter.ToInt32(BitConverter.GetBytes(f), 0);           // parse float into parts
 230                         int mantissa = (bits & 0xffffff) >> 21;
 231                         int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
 232
 233                         if (exponent > 31)
 234                         {                          // overflow: use max value
 235                                 exponent = 31;
 236                                 mantissa = 7;
 237                         }
 238
 239                         if (exponent < 0)
 240                         {                           // underflow: use min value
 241                                 exponent = 0;
 242                                 mantissa = 1;
 243                         }
 244
 245                         return (byte)((exponent << 3) | mantissa);    // pack into a byte
 246                 }
 247
 248                 /// <summary>
 249                 /// Computes a score factor based on a term or phrase's frequency in a
 250                 /// document.  This value is multiplied by the Idf(Term, Searcher)
 251                 /// factor for each term in the query and these products are then summed to
 252                 /// form the initial score for a document.
 253                 ///
 254                 /// <p>Terms and phrases repeated in a document indicate the topic of the
 255                 /// document, so implemenations of this method usually return larger values
 256                 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
 257                 /// is small.
 258                 /// </p>
 259                 ///
 260                 /// <p>The default implementation calls Tf(float).</p>
 261                 /// </summary>
 262                 /// <param name="freq">the frequency of a term within a document</param>
 263                 /// <returns>a score factor based on a term's within-document frequency</returns>
 264                 virtual public float Tf(int freq)
 265                 {
 266                         return Tf((float)freq);
 267                 }
 268
 269                 /// <summary>
 270                 /// Computes the amount of a sloppy phrase match, based on an edit distance.
 271                 /// This value is summed for each sloppy phrase match in a document to form
 272                 /// the frequency that is passed to Tf(float).
 273                 ///
 274                 /// <p>A phrase match with a small edit distance to a document passage more
 275                 /// closely matches the document, so implemenations of this method usually
 276                 /// return larger values when the edit distance is small and smaller values
 277                 /// when it is large.
 278                 /// </p>
 279                 ///
 280                 /// <see cref="PhraseQuery.SetSlop(int)"/>
 281                 /// </summary>
 282                 /// <param name="distance">the edit distance of this sloppy phrase match</param>
 283                 /// <returns>the frequency increment for this match</returns>
 284                 public abstract float SloppyFreq(int distance);
 285
 286                 /// <summary>
 287                 /// Computes a score factor based on a term or phrase's frequency in a
 288                 /// document.  This value is multiplied by the Idf(Term, Searcher)
 289                 /// factor for each term in the query and these products are then summed to
 290                 /// form the initial score for a document.
 291                 ///
 292                 /// <p>Terms and phrases repeated in a document indicate the topic of the
 293                 /// document, so implemenations of this method usually return larger values
 294                 /// when <code>freq</code> is large, and smaller values when <code>freq</code>
 295                 /// is small.
 296                 /// </p>
 297                 ///
 298                 /// </summary>
 299                 /// <param name="freq">the frequency of a term within a document</param>
 300                 /// <returns>a score factor based on a term's within-document frequency</returns>
 301                 public abstract float Tf(float freq);
 302
 303                 /// <summary>
 304                 /// Computes a score factor for a simple term.
 305                 ///
 306                 /// <p>The default implementation is:
 307                 /// <pre>
 308                 ///   return Idf(searcher.DocFreq(term), searcher.MaxDoc());
 309                 /// </pre>
 310                 ///
 311                 /// Note that Searcher.MaxDoc() is used instead of
 312                 /// IndexReader.NumDocs() because it is proportional to
 313                 /// Searcher.DocFreq(Term), i.e., when one is inaccurate, so is the other,
 314                 /// and in the same direction.
 315                 /// </p>
 316                 ///
 317                 /// </summary>
 318                 /// <param name="term">the term in question</param>
 319                 /// <param name="searcher">the document collection being searched</param>
 320                 /// <returns>a score factor for the term</returns>
 321                 virtual public float Idf(Term term, Searcher searcher)
 322                 {
 323                         return Idf(searcher.DocFreq(term), searcher.MaxDoc());
 324                 }
 325
 326                 /// <summary>
 327                 /// Computes a score factor for a phrase.
 328                 /// <p>The default implementation sums the Idf(Term,Searcher) factor
 329                 /// for each term in the phrase.
 330                 /// </p>
 331                 /// </summary>
 332                 /// <param name="terms">the vector of terms in the phrase</param>
 333                 /// <param name="searcher">the document collection being searched</param>
 334                 /// <returns>a score factor for the phrase</returns>
 335                 virtual public float Idf(ArrayList terms, Searcher searcher)
 336                 {
 337                         float _idf = 0.0f;
 338                         for (int i = 0; i < terms.Count; i++)
 339                         {
 340                                 _idf += Idf((Term)terms[i], searcher);
 341                         }
 342                         return _idf;
 343                 }
 344
 345                 /// <summary>
 346                 /// Computes a score factor based on a term's document frequency (the number
 347                 /// of documents which contain the term).  This value is multiplied by the
 348                 /// Tf(int) factor for each term in the query and these products are
 349                 /// then summed to form the initial score for a document.
 350                 ///
 351                 /// <p>Terms that occur in fewer documents are better indicators of topic, so
 352                 /// implemenations of this method usually return larger values for rare terms,
 353                 /// and smaller values for common terms.
 354                 /// </p>
 355                 /// </summary>
 356                 /// <param name="docFreq">the number of documents which contain the term</param>
 357                 /// <param name="numDocs">the total number of documents in the collection</param>
 358                 /// <returns>a score factor based on the term's document frequency</returns>
 359                 public abstract float Idf(int docFreq, int numDocs);
 360
 361                 /// <summary>
 362                 /// Computes a score factor based on the fraction of all query terms that a
 363                 /// document contains.  This value is multiplied into scores.
 364                 ///
 365                 /// <p>The presence of a large portion of the query terms indicates a better
 366                 /// match with the query, so implemenations of this method usually return
 367                 /// larger values when the ratio between these parameters is large and smaller
 368                 /// values when the ratio between them is small.
 369                 /// </p>
 370                 /// </summary>
 371                 /// <param name="overlap">the number of query terms matched in the document</param>
 372                 /// <param name="maxOverlap">the total number of terms in the query</param>
 373                 /// <returns>a score factor based on term overlap with the query</returns>
 374                 public abstract float Coord(int overlap, int maxOverlap);
 375         }
 376 }