documentation/manual/en/module_specs/Zend_Search_Lucene-Extending.xml

   1 <?xml version="1.0" encoding="UTF-8"?>
   2 <!-- Reviewed: no -->
   3 <sect1 id="zend.search.lucene.extending">
   4     <title>Extensibility</title>
   5
   6     <sect2 id="zend.search.lucene.extending.analysis">
   7         <title>Text Analysis</title>
   8
   9         <para>
  10             The <classname>Zend_Search_Lucene_Analysis_Analyzer</classname> class is used by the
  11             indexer to tokenize document text fields.
  12         </para>
  13
  14         <para>
  15             The <methodname>Zend_Search_Lucene_Analysis_Analyzer::getDefault()</methodname> and
  16             <methodname>Zend_Search_Lucene_Analysis_Analyzer::setDefault()</methodname> methods are used
  17             to get and set the default analyzer.
  18         </para>
  19
  20         <para>
  21             You can assign your own text analyzer or choose one from the set of predefined analyzers:
  22             <classname>Zend_Search_Lucene_Analysis_Analyzer_Common_Text</classname> and
  23             <classname>Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive</classname>
  24             (default). Both of them interpret tokens as sequences of letters.
  25             <classname>Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive</classname>
  26             converts all tokens to lower case.
  27         </para>
  28
  29         <para>
  30             To switch between analyzers:
  31         </para>
  32
  33         <programlisting language="php"><![CDATA[
  34 Zend_Search_Lucene_Analysis_Analyzer::setDefault(
  35     new Zend_Search_Lucene_Analysis_Analyzer_Common_Text());
  36 ...
  37 $index->addDocument($doc);
  38 ]]></programlisting>
  39
  40         <para>
  41             The <classname>Zend_Search_Lucene_Analysis_Analyzer_Common</classname> class is designed
  42             to be an ancestor of all user defined analyzers. Users only need to define the
  43             <methodname>reset()</methodname> and <methodname>nextToken()</methodname> methods; the
  44             latter takes its string from the <varname>$_input</varname> member and returns tokens one
  45             by one (a <constant>NULL</constant> value indicates the end of the stream).
  46         </para>
  47
  48         <para>
  49             The <methodname>nextToken()</methodname> method should call the
  50             <methodname>normalize()</methodname> method on each token. This will allow you to use
  51             token filters with your analyzer.
  52         </para>
  53
  54         <para>
  55             Here is an example of a custom analyzer, which accepts words with digits as terms:
  56
  57             <example id="zend.search.lucene.extending.analysis.example-1">
  58                 <title>Custom text Analyzer</title>
  59
  60                 <programlisting language="php"><![CDATA[
  61 /**
  62  * Custom text analyzer which treats every run of alphanumeric characters
  63  * (for example a word containing digits) as a single term.
  64  */
  65
  66 class My_Analyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
  67 {
  68     private $_position;
  69
  70     /**
  71      * Reset token stream: rewind the read position to the start of the input
  72      */
  73     public function reset()
  74     {
  75         $this->_position = 0;
  76     }
  77
  78     /**
  79      * Tokenization stream API
  80      * Get the next alphanumeric token that is not skipped by normalize()
  81      * Returns null at the end of stream, or when no input has been set
  82      *
  83      * @return Zend_Search_Lucene_Analysis_Token|null
  84      */
  85     public function nextToken()
  86     {
  87         if ($this->_input === null) {
  88             return null;
  89         }
  90
  91         while ($this->_position < strlen($this->_input)) {
  92             // skip separators: every character that is not alphanumeric
  93             while ($this->_position < strlen($this->_input) &&
  94                    !ctype_alnum( $this->_input[$this->_position] )) {
  95                 $this->_position++;
  96             }
  97
  98             $termStartPosition = $this->_position;
  99
 100             // read token: consume the following run of alphanumeric characters
 101             while ($this->_position < strlen($this->_input) &&
 102                    ctype_alnum( $this->_input[$this->_position] )) {
 103                 $this->_position++;
 104             }
 105
 106             // Empty token: only separators were left, so this is the end of stream.
 107             if ($this->_position == $termStartPosition) {
 108                 return null;
 109             }
 110
 111             $token = new Zend_Search_Lucene_Analysis_Token(
 112                                       substr($this->_input,
 113                                              $termStartPosition,
 114                                              $this->_position -
 115                                              $termStartPosition),
 116                                       $termStartPosition,
 117                                       $this->_position);
 118             $token = $this->normalize($token);
 119             if ($token !== null) {
 120                 return $token;
 121             }
 122             // normalize() returned null (token skipped): continue with the next one
 123         }
 124
 125         return null;
 126     }
 127 }
 128
 129 Zend_Search_Lucene_Analysis_Analyzer::setDefault(
 130     new My_Analyzer());
 131 ]]></programlisting>
 132             </example>
 133         </para>
 134     </sect2>
 135
 136     <sect2 id="zend.search.lucene.extending.filters">
 137         <title>Token Filtering</title>
 138
 139         <para>
 140             The <classname>Zend_Search_Lucene_Analysis_Analyzer_Common</classname> analyzer also
 141             offers a token filtering mechanism.
 142         </para>
 143
 144         <para>
 145             The <classname>Zend_Search_Lucene_Analysis_TokenFilter</classname> class provides an
 146             abstract interface for such filters. Your own filters should extend this class either
 147             directly or indirectly.
 148         </para>
 149
 150         <para>
 151             Any custom filter must implement the <methodname>normalize()</methodname> method which
 152             may transform the input token or signal that the current token should be skipped.
 153         </para>
 154
 155         <para>
 156             There are three filters already defined in the analysis subpackage:
 157
 158             <itemizedlist>
 159                 <listitem>
 160                     <para>
 161                         <classname>Zend_Search_Lucene_Analysis_TokenFilter_LowerCase</classname>
 162                     </para>
 163                 </listitem>
 164
 165                 <listitem>
 166                     <para>
 167                         <classname>Zend_Search_Lucene_Analysis_TokenFilter_ShortWords</classname>
 168                     </para>
 169                 </listitem>
 170
 171                 <listitem>
 172                     <para>
 173                         <classname>Zend_Search_Lucene_Analysis_TokenFilter_StopWords</classname>
 174                     </para>
 175                 </listitem>
 176             </itemizedlist>
 177         </para>
 178
 179         <para>
 180             The <code>LowerCase</code> filter is already used by the
 181             <classname>Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive</classname>
 182             analyzer by default.
 183         </para>
 184
 185         <para>
 186             The <code>ShortWords</code> and <code>StopWords</code> filters may be used with
 187             pre-defined or custom analyzers like this:
 188
 189             <programlisting language="php"><![CDATA[
 190 $stopWords = array('a', 'an', 'at', 'the', 'and', 'or', 'is', 'am');
 191 $stopWordsFilter =
 192     new Zend_Search_Lucene_Analysis_TokenFilter_StopWords($stopWords);
 193
 194 $analyzer =
 195     new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive();
 196 $analyzer->addFilter($stopWordsFilter);
 197
 198 Zend_Search_Lucene_Analysis_Analyzer::setDefault($analyzer);
 199 ]]></programlisting>
 200             <programlisting language="php"><![CDATA[
 201 $shortWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_ShortWords();
 202
 203 $analyzer =
 204     new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive();
 205 $analyzer->addFilter($shortWordsFilter);
 206
 207 Zend_Search_Lucene_Analysis_Analyzer::setDefault($analyzer);
 208 ]]></programlisting>
 209         </para>
 210
 211         <para>
 212             The <classname>Zend_Search_Lucene_Analysis_TokenFilter_StopWords</classname> constructor
 213             takes an array of stop-words as an input. Stop-words may also be loaded from a file:
 214
 215             <programlisting language="php"><![CDATA[
 216 $stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords();
 217 $stopWordsFilter->loadFromFile($my_stopwords_file);
 218
 219 $analyzer =
 220    new Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive();
 221 $analyzer->addFilter($stopWordsFilter);
 222
 223 Zend_Search_Lucene_Analysis_Analyzer::setDefault($analyzer);
 224 ]]></programlisting>
 225
 226             This file should be a plain text file with one word per line. The '#' character
 227             marks a line as a comment.
 228         </para>
 229
 230         <para>
 231             The <classname>Zend_Search_Lucene_Analysis_TokenFilter_ShortWords</classname>
 232             constructor has one optional argument. This is the word length limit, set by default to
 233             2.
 234         </para>
 235     </sect2>
 236
 237     <sect2 id="zend.search.lucene.extending.scoring">
 238         <title>Scoring Algorithms</title>
 239
 240         <para>
 241             The score of a document <literal>d</literal> for a query <literal>q</literal>
 242             is defined as follows:
 243         </para>
 244
 245         <para>
 246             <code>score(q,d) = sum( tf(t in d) * idf(t) * getBoost(t.field in d) *
 247                 lengthNorm(t.field in d) ) * coord(q,d) * queryNorm(q)</code>
 248         </para>
 249
 250         <para>
 251             tf(t in d) - <methodname>Zend_Search_Lucene_Search_Similarity::tf($freq)</methodname> -
 252             a score factor based on the frequency of a term or phrase in a document.
 253         </para>
 254
 255         <para>
 256             idf(t) - <methodname>Zend_Search_Lucene_Search_Similarity::idf($input,
 257                 $reader)</methodname> - a score factor for a simple term with the specified index.
 258         </para>
 259
 260         <para>
 261             getBoost(t.field in d) - the boost factor for the term field.
 262         </para>
 263
 264         <para>
 265             lengthNorm(t.field in d) - the normalization value for a field given the total
 266             number of terms contained in the field. These values, together with field boosts,
 267             are stored in the index and multiplied into scores for hits on each field by the
 268             search code.
 269         </para>
 270
 271         <para>
 272             Matches in longer fields are less precise, so implementations of this method usually
 273             return smaller values when the number of terms is large, and larger values when it is
 274             small.
 275         </para>
 276
 277         <para>
 278             coord(q,d) - <methodname>Zend_Search_Lucene_Search_Similarity::coord($overlap,
 279                 $maxOverlap)</methodname> - a score factor based on the fraction of all query terms
 280             that a document contains.
 281         </para>
 282
 283         <para>
 284             The presence of a large portion of the query terms indicates a better match
 285             with the query, so implementations of this method usually return larger values
 286             when the ratio between these parameters is large and smaller values when
 287             the ratio between them is small.
 288         </para>
 289
 290         <para>
 291             queryNorm(q) - the normalization value for a query given the sum of the squared weights
 292             of each of the query terms. This value is then multiplied into the weight of each query
 293             term.
 294         </para>
 295
 296         <para>
 297             This does not affect ranking, but rather just attempts to make scores from different
 298             queries comparable.
 299         </para>
 300
 301         <para>
 302             The scoring algorithm can be customized by defining your own Similarity class. To do
 303             this extend the <classname>Zend_Search_Lucene_Search_Similarity</classname> class as
 304             defined below, then use the
 305             <classname>Zend_Search_Lucene_Search_Similarity::setDefault($similarity);</classname>
 306             method to set it as default.
 307         </para>
 308
 309         <programlisting language="php"><![CDATA[
 310 class MySimilarity extends Zend_Search_Lucene_Search_Similarity {
 311     public function lengthNorm($fieldName, $numTerms) {
 312         return 1.0/sqrt($numTerms); // fewer terms give a larger norm; $fieldName is unused
 313     }
 314
 315     public function queryNorm($sumOfSquaredWeights) {
 316         return 1.0/sqrt($sumOfSquaredWeights); // assumes a non-zero sum of squared weights
 317     }
 318
 319     public function tf($freq) {
 320         return sqrt($freq); // grows with the frequency, but sub-linearly
 321     }
 322
 323     /**
 324      * Not used at present. Computes the amount of a sloppy phrase match,
 325      * based on an edit distance. This implementation ignores $distance.
 326      */
 327     public function sloppyFreq($distance) {
 328         return 1.0;
 329     }
 330
 331     public function idfFreq($docFreq, $numDocs) {
 332         return log($numDocs/(float)($docFreq+1)) + 1.0; // +1 keeps the divisor non-zero when $docFreq is 0
 333     }
 334
 335     public function coord($overlap, $maxOverlap) {
 336         return $overlap/(float)$maxOverlap; // ratio of the two; assumes $maxOverlap is non-zero
 337     }
 338 }
 339
 340 $mySimilarity = new MySimilarity();
 341 Zend_Search_Lucene_Search_Similarity::setDefault($mySimilarity);
 342 ]]></programlisting>
 343     </sect2>
 344
 345     <sect2 id="zend.search.lucene.extending.storage">
 346         <title>Storage Containers</title>
 347
 348         <para>
 349             The abstract class <classname>Zend_Search_Lucene_Storage_Directory</classname> defines
 350             directory functionality.
 351         </para>
 352
 353         <para>
 354             The <classname>Zend_Search_Lucene</classname> constructor uses either a string or a
 355             <classname>Zend_Search_Lucene_Storage_Directory</classname> object as an input.
 356         </para>
 357
 358         <para>
 359             The <classname>Zend_Search_Lucene_Storage_Directory_Filesystem</classname> class
 360             implements directory functionality for a file system.
 361         </para>
 362
 363         <para>
 364             If a string is used as an input for the <classname>Zend_Search_Lucene</classname>
 365             constructor, then the index reader (<classname>Zend_Search_Lucene</classname> object)
 366             treats it as a file system path and instantiates the
 367             <classname>Zend_Search_Lucene_Storage_Directory_Filesystem</classname> object.
 368         </para>
 369
 370         <para>
 371             You can define your own directory implementation by extending the
 372             <classname>Zend_Search_Lucene_Storage_Directory</classname> class.
 373         </para>
 374
 375         <para>
 376             <classname>Zend_Search_Lucene_Storage_Directory</classname> methods:
 377         </para>
 378
 379         <programlisting language="php"><![CDATA[
 380 abstract class Zend_Search_Lucene_Storage_Directory {
 381 /**
 382  * Closes the store.
 383  *
 384  * @return void
 385  */
 386 abstract function close();
 387
 388 /**
 389  * Creates a new, empty file in the directory with the given $filename.
 390  *
 391  * @param string $filename
 392  * @return void
 393  */
 394 abstract function createFile($filename);
 395
 396 /**
 397  * Removes the existing file $filename from the directory.
 398  *
 399  * @param string $filename
 400  * @return void
 401  */
 402 abstract function deleteFile($filename);
 403
 404 /**
 405  * Returns true if a file with the given $filename exists.
 406  *
 407  * @param string $filename
 408  * @return boolean
 409  */
 410 abstract function fileExists($filename);
 411
 412 /**
 413  * Returns the length of the file $filename in the directory.
 414  *
 415  * @param string $filename
 416  * @return integer
 417  */
 418 abstract function fileLength($filename);
 419
 420 /**
 421  * Returns the UNIX timestamp at which $filename was last modified.
 422  *
 423  * @param string $filename
 424  * @return integer
 425  */
 426 abstract function fileModified($filename);
 427
 428 /**
 429  * Renames an existing file in the directory from $from to $to.
 430  *
 431  * @param string $from
 432  * @param string $to
 433  * @return void
 434  */
 435 abstract function renameFile($from, $to);
 436
 437 /**
 438  * Sets the modified time of $filename to now.
 439  *
 440  * @param string $filename
 441  * @return void
 442  */
 443 abstract function touchFile($filename);
 444
 445 /**
 446  * Returns a Zend_Search_Lucene_Storage_File object for a given
 447  * $filename in the directory.
 448  *
 449  * @param string $filename
 450  * @return Zend_Search_Lucene_Storage_File
 451  */
 452 abstract function getFileObject($filename);
 453
 454 }
 455 ]]></programlisting>
 456
 457         <para>
 458             The <methodname>getFileObject($filename)</methodname> method of a
 459             <classname>Zend_Search_Lucene_Storage_Directory</classname> instance returns a
 460             <classname>Zend_Search_Lucene_Storage_File</classname> object.
 461         </para>
 462
 463         <para>
 464             The <classname>Zend_Search_Lucene_Storage_File</classname> abstract class implements
 465             file abstraction and index file reading primitives.
 466         </para>
 467
 468         <para>
 469             You must also extend <classname>Zend_Search_Lucene_Storage_File</classname> for your
 470             directory implementation.
 471         </para>
 472
 473         <para>
 474             Only two methods of <classname>Zend_Search_Lucene_Storage_File</classname> must be
 475             overridden in your implementation:
 476         </para>
 477
 478         <programlisting language="php"><![CDATA[
 479 class MyFile extends Zend_Search_Lucene_Storage_File {
 480     /**
 481      * Sets the file position indicator and advances the file pointer.
 482      * The new position, measured in bytes from the beginning of the file,
 483      * is obtained by adding $offset to the position specified by $whence,
 484      * whose values are defined as follows:
 485      * SEEK_SET - Set position equal to $offset bytes.
 486      * SEEK_CUR - Set position to current location plus $offset.
 487      * SEEK_END - Set position to end-of-file plus $offset. (To move to
 488      * a position before the end-of-file, you need to pass a negative value
 489      * in $offset.)
 490      * Upon success, returns 0; otherwise, returns -1.
 491      *
 492      * @param integer $offset
 493      * @param integer $whence
 494      * @return integer
 495      */
 496     public function seek($offset, $whence=SEEK_SET) {
 497         ...
 498     }
 499
 500     /**
 501      * Reads $length bytes from the file and advances the file pointer.
 502      *
 503      * @param integer $length
 504      * @return string
 505      */
 506     protected function _fread($length=1) {
 507         ...
 508     }
 509 }
 510 ]]></programlisting>
 511     </sect2>
 512 </sect1>
 513 <!--
 514 vim:se ts=4 sw=4 et:
 515 -->