documentation/manual/en/module_specs/Zend_Search_Lucene-IndexCreation.xml

   1 <?xml version="1.0" encoding="UTF-8"?>
   2 <!-- Reviewed: no -->
   3 <sect1 id="zend.search.lucene.index-creation">
   4     <title>Building Indexes</title>
   5
   6     <sect2 id="zend.search.lucene.index-creation.creating">
   7         <title>Creating a New Index</title>
   8
   9         <para>
  10             Index creation and updating capabilities are implemented within the
  11             <classname>Zend_Search_Lucene</classname> component, as well as the Java Lucene project.
  12             You can use either of these options to create indexes that
  13             <classname>Zend_Search_Lucene</classname> can search.
  14         </para>
  15
  16         <para>
  17             The <acronym>PHP</acronym> code listing below provides an example of how to index a file
  18             using the <classname>Zend_Search_Lucene</classname> indexing <acronym>API</acronym>:
  19         </para>
  20
  21         <programlisting language="php"><![CDATA[
  22 // Create index
  23 $index = Zend_Search_Lucene::create('/data/my-index');
  24
  25 $doc = new Zend_Search_Lucene_Document();
  26
  27 // Store document URL to identify it in the search results
  28 $doc->addField(Zend_Search_Lucene_Field::Text('url', $docUrl));
  29
  30 // Index document contents
  31 $doc->addField(Zend_Search_Lucene_Field::UnStored('contents', $docContent));
  32
  33 // Add document to the index
  34 $index->addDocument($doc);
  35 ]]></programlisting>
  36
  37         <para>
  38             Newly added documents are immediately searchable in the index.
  39         </para>
  40     </sect2>
  41
  42     <sect2 id="zend.search.lucene.index-creation.updating">
  43         <title>Updating Index</title>
  44
  45         <para>
  46             The same procedure is used to update an existing index. The only difference
  47             is that the open() method is called instead of the create() method:
  48         </para>
  49
  50         <programlisting language="php"><![CDATA[
  51 // Open existing index
  52 $index = Zend_Search_Lucene::open('/data/my-index');
  53
  54 $doc = new Zend_Search_Lucene_Document();
  55 // Store document URL to identify it in search result.
  56 $doc->addField(Zend_Search_Lucene_Field::Text('url', $docUrl));
  57 // Index document content
  58 $doc->addField(Zend_Search_Lucene_Field::UnStored('contents',
  59                                                   $docContent));
  60
  61 // Add document to the index.
  62 $index->addDocument($doc);
  63 ]]></programlisting>
  64     </sect2>
  65
  66     <sect2 id="zend.search.lucene.index-creation.document-updating">
  67         <title>Updating Documents</title>
  68
  69         <para>
  70             The Lucene index file format doesn't support document updating.
  71             Documents should be removed and re-added to the index to effectively update them.
  72         </para>
  73
  74         <para>
  75             The <methodname>Zend_Search_Lucene::delete()</methodname> method operates on an internal
  76             index document id. It can be retrieved from a query hit via its 'id' property:
  77         </para>
  78
  79         <programlisting language="php"><![CDATA[
  80 $removePath = ...;
  81 $hits = $index->find('path:' . $removePath);
  82 foreach ($hits as $hit) {
  83     $index->delete($hit->id);
  84 }
  85 ]]></programlisting>
  86     </sect2>
  87
  88     <sect2 id="zend.search.lucene.index-creation.counting">
  89         <title>Retrieving Index Size</title>
  90
  91         <para>
  92             There are two methods to retrieve the size of an index in
  93             <classname>Zend_Search_Lucene</classname>.
  94         </para>
  95
  96         <para>
  97              <methodname>Zend_Search_Lucene::maxDoc()</methodname> returns one greater than the
  98              largest possible document number. It's actually the overall number of the documents in
  99              the index including deleted documents, so it has a synonym:
 100              <methodname>Zend_Search_Lucene::count()</methodname>.
 101         </para>
 102
 103         <para>
 104              <methodname>Zend_Search_Lucene::numDocs()</methodname> returns the total number of
 105              non-deleted documents.
 106         </para>
 107
 108         <programlisting language="php"><![CDATA[
 109 $indexSize = $index->count();
 110 $documents = $index->numDocs();
 111 ]]></programlisting>
 112
 113         <para>
 114             <methodname>Zend_Search_Lucene::isDeleted($id)</methodname> method may be used to check
 115             if a document is deleted.
 116         </para>
 117
 118         <programlisting language="php"><![CDATA[
 119 for ($count = 0; $count < $index->maxDoc(); $count++) {
 120     if ($index->isDeleted($count)) {
 121         echo "Document #$count is deleted.\n";
 122     }
 123 }
 124 ]]></programlisting>
 125
 126         <para>
 127             Index optimization removes deleted documents and squeezes documents' IDs into a smaller
 128             range. A document's internal id may therefore change during index optimization.
 129         </para>
 130     </sect2>
 131
 132     <sect2 id="zend.search.lucene.index-creation.optimization">
 133         <title>Index optimization</title>
 134
 135         <para>
 136             A Lucene index consists of many segments. Each segment is a completely independent set
 137             of data.
 138         </para>
 139
 140         <para>
 141             Lucene index segment files can't be updated by design. A segment update needs full
 142             segment reorganization. See Lucene index file formats for details (<ulink
 143                 url="http://lucene.apache.org/java/2_3_0/fileformats.html">http://lucene.apache.org/java/2_3_0/fileformats.html</ulink>)
 144
 145             <footnote>
 146                 <para>
 147                     The currently supported Lucene index file format is version 2.3 (starting from
 148                     Zend Framework 1.6).
 149                 </para>
 150             </footnote>.
 151
 152             New documents are added to the index by creating a new segment.
 153         </para>
 154
 155         <para>
 156             An increasing number of segments reduces the quality of the index, but index optimization
 157             restores it. Optimization essentially merges several segments into a new one. This
 158             process also doesn't update segments. It generates one new large segment and updates
 159             the segment list ('segments' file).
 160         </para>
 161
 162         <para>
 163             Full index optimization can be triggered by calling the
 164             <methodname>Zend_Search_Lucene::optimize()</methodname> method. It merges all index
 165             segments into one new segment:
 166         </para>
 167
 168         <programlisting language="php"><![CDATA[
 169 // Open existing index
 170 $index = Zend_Search_Lucene::open('/data/my-index');
 171
 172 // Optimize index.
 173 $index->optimize();
 174 ]]></programlisting>
 175
 176         <para>
 177             Automatic index optimization is performed to keep indexes in a consistent state.
 178         </para>
 179
 180         <para>
 181             Automatic optimization is an iterative process managed by several index options. It
 182             merges very small segments into larger ones, then merges these larger segments into even
 183             larger segments and so on.
 184         </para>
 185
 186         <sect3 id="zend.search.lucene.index-creation.optimization.maxbuffereddocs">
 187             <title>MaxBufferedDocs auto-optimization option</title>
 188
 189             <para>
 190                 <emphasis>MaxBufferedDocs</emphasis> is the minimum number of documents required
 191                 before the buffered in-memory documents are written into a new segment.
 192             </para>
 193
 194             <para>
 195                 <emphasis>MaxBufferedDocs</emphasis> can be retrieved or set by
 196                 <code>$index->getMaxBufferedDocs()</code> or
 197                 <code>$index->setMaxBufferedDocs($maxBufferedDocs)</code> calls.
 198             </para>
 199
 200             <para>
 201                 Default value is 10.
 202             </para>
 203         </sect3>
 204
 205         <sect3 id="zend.search.lucene.index-creation.optimization.maxmergedocs">
 206             <title>MaxMergeDocs auto-optimization option</title>
 207
 208             <para>
 209                 <emphasis>MaxMergeDocs</emphasis> is the largest number of documents ever merged by
 210                 addDocument(). Small values (e.g., less than 10,000) are best for interactive
 211                 indexing, as this limits the length of pauses while indexing to a few seconds.
 212                 Larger values are best for batched indexing and speedier searches.
 213             </para>
 214
 215             <para>
 216                 <emphasis>MaxMergeDocs</emphasis> can be retrieved or set by
 217                 <code>$index->getMaxMergeDocs()</code> or
 218                 <code>$index->setMaxMergeDocs($maxMergeDocs)</code> calls.
 219             </para>
 220
 221             <para>
 222                 Default value is PHP_INT_MAX.
 223             </para>
 224         </sect3>
 225
 226         <sect3 id="zend.search.lucene.index-creation.optimization.mergefactor">
 227             <title>MergeFactor auto-optimization option</title>
 228
 229             <para>
 230                 <emphasis>MergeFactor</emphasis> determines how often segment indices are merged by
 231                 addDocument(). With smaller values, less <acronym>RAM</acronym> is used while
 232                 indexing, and searches on unoptimized indices are faster, but indexing speed is
 233                 slower. With larger values, more <acronym>RAM</acronym> is used during indexing, and
 234                 while searches on unoptimized indices are slower, indexing is faster. Thus larger
 235                 values (&gt; 10) are best for batch index creation, and smaller values (&lt; 10) for
 236                 indices that are interactively maintained.
 237             </para>
 238
 239             <para>
 240                 <emphasis>MergeFactor</emphasis> is a good estimate of the average number of segments
 241                 merged by one auto-optimization pass. Values that are too large produce a large number
 242                 of segments before they are merged into a new one. This may cause a "failed to
 243                 open stream: Too many open files" error message. This limitation is system
 244                 dependent.
 245             </para>
 246
 247             <para>
 248                 <emphasis>MergeFactor</emphasis> can be retrieved or set by
 249                 <code>$index->getMergeFactor()</code> or
 250                 <code>$index->setMergeFactor($mergeFactor)</code> calls.
 251             </para>
 252
 253             <para>
 254                 Default value is 10.
 255             </para>
 256
 257             <para>
 258                 Lucene Java and Luke (Lucene Index Toolbox - <ulink
 259                     url="http://www.getopt.org/luke/">http://www.getopt.org/luke/</ulink>) can also
 260                 be used to optimize an index. The latest Luke release (v0.8) is based on Lucene v2.3 and
 261                 is compatible with the current implementation of the <classname>Zend_Search_Lucene</classname>
 262                 component (Zend Framework 1.6). Earlier versions of the
 263                 <classname>Zend_Search_Lucene</classname> implementation need other versions of the
 264                 Java Lucene tools to be compatible:
 265
 266                 <itemizedlist>
 267                     <listitem>
 268                         <para>
 269                             Zend Framework 1.5 - Java Lucene 2.1 (Luke tool v0.7.1 - <ulink
 270                                 url="http://www.getopt.org/luke/luke-0.7.1/"/>)
 271                         </para>
 272                     </listitem>
 273
 274                     <listitem>
 275                         <para>
 276                             Zend Framework 1.0 - Java Lucene 1.4 - 2.1 (Luke tool v0.6 - <ulink
 277                                 url="http://www.getopt.org/luke/luke-0.6/"/>)
 278                         </para>
 279                     </listitem>
 280                 </itemizedlist>
 281             </para>
 282         </sect3>
 283     </sect2>
 284
 285     <sect2 id="zend.search.lucene.index-creation.permissions">
 286         <title>Permissions</title>
 287
 288         <para>
 289             By default, index files are available for reading and writing by everyone.
 290         </para>
 291
 292         <para>
 293             It's possible to override this with the
 294             <methodname>Zend_Search_Lucene_Storage_Directory_Filesystem::setDefaultFilePermissions()</methodname>
 295             method:
 296         </para>
 297
 298         <programlisting language="php"><![CDATA[
 299 // Get current default file permissions
 300 $currentPermissions =
 301     Zend_Search_Lucene_Storage_Directory_Filesystem::getDefaultFilePermissions();
 302
 303 // Give read-write permissions only to the current user and group
 304 Zend_Search_Lucene_Storage_Directory_Filesystem::setDefaultFilePermissions(0660);
 305 ]]></programlisting>
 306     </sect2>
 307
 308     <sect2 id="zend.search.lucene.index-creation.limitations">
 309         <title>Limitations</title>
 310
 311         <sect3 id="zend.search.lucene.index-creation.limitations.index-size">
 312             <title>Index size</title>
 313
 314             <para>
 315                 Index size is limited to 2GB on 32-bit platforms.
 316             </para>
 317
 318             <para>
 319                 Use 64-bit platforms for larger indices.
 320             </para>
 321         </sect3>
 322
 323         <sect3 id="zend.search.lucene.index-creation.limitations.filesystems">
 324             <title>Supported Filesystems</title>
 325
 326             <para>
 327                 <classname>Zend_Search_Lucene</classname> uses <methodname>flock()</methodname> to
 328                 provide concurrent searching, index updating and optimization.
 329             </para>
 330
 331             <para>
 332                 According to the <acronym>PHP</acronym> <ulink
 333                     url="http://www.php.net/manual/en/function.flock.php">documentation</ulink>,
 334                 "<methodname>flock()</methodname> will not work on NFS and many other networked file
 335                 systems".
 336             </para>
 337
 338             <para>
 339                 Do not use networked file systems with <classname>Zend_Search_Lucene</classname>.
 340             </para>
 341         </sect3>
 342     </sect2>
 343 </sect1>
 344 <!--
 345 vim:se ts=4 sw=4 et:
 346 -->