Merge commit 'catalyst/MOODLE_19_STABLE' into mdl19-linuxchix
[moodle-linuxchix.git] / search / Zend / Search / Lucene / Index / Writer.php
blob8e32f4e1f6ed30ce62e43363d0925204b325f60a
1 <?php
2 /**
3 * Zend Framework
5 * LICENSE
7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
15 * @category Zend
16 * @package Zend_Search_Lucene
17 * @subpackage Index
18 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
23 /** Zend_Search_Lucene_Index_SegmentWriter_ */
24 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php';
26 /** Zend_Search_Lucene_Index_SegmentInfo */
27 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
29 /** Zend_Search_Lucene_Index_SegmentMerger */
30 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentMerger.php';
34 /**
35 * @category Zend
36 * @package Zend_Search_Lucene
37 * @subpackage Index
38 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
39 * @license http://framework.zend.com/license/new-bsd New BSD License
41 class Zend_Search_Lucene_Index_Writer
/**
 * @todo Implement Analyzer substitution
 * @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
 *       temporary index files
 * @todo Directory lock processing
 */

/**
 * Number of documents required before the buffered in-memory
 * documents are written into a new Segment.
 *
 * addDocument() commits the current segment as soon as its document count
 * reaches this value; it is also the size of the first merge level.
 *
 * Default value is 10
 *
 * @var integer
 */
public $maxBufferedDocs = 10;

/**
 * Largest number of documents ever merged by addDocument().
 * Small values (e.g., less than 10,000) are best for interactive indexing,
 * as this limits the length of pauses while indexing to a few seconds.
 * Larger values are best for batched indexing and speedier searches.
 *
 * Default value is PHP_INT_MAX
 *
 * @var integer
 */
public $maxMergeDocs = PHP_INT_MAX;

/**
 * Determines how often segment indices are merged by addDocument().
 *
 * With smaller values, less RAM is used while indexing,
 * and searches on unoptimized indices are faster,
 * but indexing speed is slower.
 *
 * With larger values, more RAM is used during indexing,
 * and while searches on unoptimized indices are slower,
 * indexing is faster.
 *
 * Thus larger values (> 10) are best for batch index creation,
 * and smaller values (< 10) for indices that are interactively maintained.
 *
 * Default value is 10
 *
 * @var integer
 */
public $mergeFactor = 10;

/**
 * File system adapter.
 *
 * @var Zend_Search_Lucene_Storage_Directory
 */
private $_directory = null;

/**
 * Changes counter.
 *
 * Incremented for every added document and folded into the index version
 * the next time the segments file is rewritten.
 *
 * @var integer
 */
private $_versionUpdate = 0;

/**
 * List of the segments, created by index writer.
 * Array of Zend_Search_Lucene_Index_SegmentInfo objects, keyed by segment name.
 *
 * @var array
 */
private $_newSegments = array();

/**
 * List of segments to be deleted on commit.
 * Segment names, keyed by segment name.
 *
 * @var array
 */
private $_segmentsToDelete = array();

/**
 * Current segment to add documents.
 *
 * @var Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter
 */
private $_currentSegment = null;

/**
 * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
 *
 * Bound by reference to the array handed to the constructor (the
 * corresponding Zend_Search_Lucene::$_segmentInfos array).
 *
 * @var array Zend_Search_Lucene_Index_SegmentInfo
 */
private $_segmentInfos;

/**
 * List of index file extensions.
 *
 * Keys and values are identical, so the list can be probed with isset()
 * as well as iterated.
 *
 * @var array
 */
private static $_indexExtensions = array(
    '.cfs' => '.cfs',
    '.fnm' => '.fnm',
    '.fdx' => '.fdx',
    '.fdt' => '.fdt',
    '.tis' => '.tis',
    '.tii' => '.tii',
    '.frq' => '.frq',
    '.prx' => '.prx',
    '.tvx' => '.tvx',
    '.tvd' => '.tvd',
    '.tvf' => '.tvf',
    '.del' => '.del',
    '.sti' => '.sti'
);
/**
 * Opens the index for writing.
 *
 * When $create is true, every index file found in the directory is removed
 * ('segments', 'deletable', files with a known index extension and per-field
 * '<segment_name>.f<decimal_number>' files) and fresh, empty 'segments' and
 * 'deletable' files are written. Otherwise the existing 'segments' file is
 * opened and its format marker is validated.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory    Storage the index lives in
 * @param array                                $segmentInfos Segment list, bound by reference
 * @param boolean                              $create       True to create the index or overwrite the existing one
 * @throws Zend_Search_Lucene_Exception if an existing segments file has an unexpected format marker
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, &$segmentInfos, $create = false)
{
    $this->_directory    = $directory;
    $this->_segmentInfos = &$segmentInfos;

    if ($create) {
        foreach ($this->_directory->fileList() as $file) {
            if ($file == 'deletable' ||
                $file == 'segments'  ||
                isset(self::$_indexExtensions[substr($file, -4)]) ||
                preg_match('/\.f\d+$/i', $file) /* matches <segment_name>.f<decimal_number> file names */) {
                $this->_directory->deleteFile($file);
            }
        }

        $segmentsFile = $this->_directory->createFile('segments');
        // Format marker
        $segmentsFile->writeInt((int)0xFFFFFFFF);

        // Write version (initialized by current time) as two 32-bit words.
        // An integer timestamp is used so that the bitwise AND below does not
        // rely on an implicit, lossy float-to-int conversion.
        $version = time();
        $segmentsFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
        $segmentsFile->writeInt((int)($version & 0xFFFFFFFF));

        // write name counter
        $segmentsFile->writeInt(0);
        // write segment counter
        $segmentsFile->writeInt(0);

        $deletableFile = $this->_directory->createFile('deletable');
        // write counter
        $deletableFile->writeInt(0);
    } else {
        $segmentsFile = $this->_directory->getFileObject('segments');
        $format = $segmentsFile->readInt();
        if ($format != (int)0xFFFFFFFF) {
            throw new Zend_Search_Lucene_Exception('Wrong segments file format');
        }
    }
}
/**
 * Adds a document to this index.
 *
 * The document goes into the current in-memory segment, which is opened on
 * demand. Once that segment holds $maxBufferedDocs documents it is committed.
 * Afterwards the change counter is bumped and segments are merged if needed.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    // Open a fresh segment writer on first use (or after a commit)
    if (null === $this->_currentSegment) {
        $segmentName = $this->_newSegmentName();
        $this->_currentSegment =
            new Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter($this->_directory, $segmentName);
    }

    $writer = $this->_currentSegment;
    $writer->addDocument($document);

    // Flush the buffer once it has reached its configured capacity
    $bufferIsFull = ($writer->count() >= $this->maxBufferedDocs);
    if ($bufferIsFull) {
        $this->commit();
    }

    $this->_versionUpdate += 1;

    $this->_maybeMergeSegments();
}
/**
 * Merge segments if necessary.
 *
 * Segments are visited in ascending size order and grouped into merge levels.
 * The first level ends at $maxBufferedDocs documents and every next level is
 * $mergeFactor times larger. Whenever the segments collected for a level hold
 * at least that level's size in documents, they are merged. Processing stops
 * as soon as a level would exceed $maxMergeDocs.
 */
private function _maybeMergeSegments()
{
    // The tuning properties are public and not validated on assignment.
    // A merge factor below 2 or a non-positive buffer size would keep the
    // level size from ever growing past a segment's size, so the while loop
    // below would never terminate. Clamp both to the smallest usable values.
    $mergeFactor = max(2, $this->mergeFactor);
    $sizeToMerge = max(1, $this->maxBufferedDocs);

    $segmentSizes = array();
    foreach ($this->_segmentInfos as $segId => $segmentInfo) {
        $segmentSizes[$segId] = $segmentInfo->count();
    }
    asort($segmentSizes, SORT_NUMERIC);

    $mergePool = array();
    $poolSize  = 0;
    foreach ($segmentSizes as $segId => $size) {
        // Check, if segment comes into a new merging block
        while ($size >= $sizeToMerge) {
            // Merge previous block if it's large enough
            if ($poolSize >= $sizeToMerge) {
                $this->_mergeSegments($mergePool);
            }
            $mergePool = array();
            $poolSize  = 0;

            $sizeToMerge *= $mergeFactor;

            if ($sizeToMerge > $this->maxMergeDocs) {
                return;
            }
        }

        // A merge performed above commits, and the commit may drop entries
        // from the segment list; skip ids that are no longer present.
        if (!isset($this->_segmentInfos[$segId])) {
            continue;
        }

        $mergePool[] = $this->_segmentInfos[$segId];
        $poolSize   += $size;
    }

    if ($poolSize >= $sizeToMerge) {
        $this->_mergeSegments($mergePool);
    }
}
/**
 * Merge specified segments.
 *
 * $segments is an array of SegmentInfo objects.
 *
 * The merge is skipped silently when the list is empty or when the
 * optimization lock cannot be obtained. Source segments are queued for
 * deletion only after the merger has finished successfully, so a failed merge
 * cannot cause a later commit to drop segments that have no replacement.
 *
 * @param array $segments
 * @throws Exception whatever the merger or the commit throws, after the optimization lock has been released
 */
private function _mergeSegments($segments)
{
    // Nothing to merge: do not consume a segment name or touch the lock file
    if (count($segments) == 0) {
        return;
    }

    // Try to get exclusive non-blocking lock to the 'index.optimization.lock'
    // Skip optimization if it's performed by other process right now
    $optimizationLock = $this->_directory->createFile('index.optimization.lock');
    if (!$optimizationLock->lock(LOCK_EX, true)) {
        return;
    }

    try {
        $newName = $this->_newSegmentName();
        $merger  = new Zend_Search_Lucene_Index_SegmentMerger($this->_directory,
                                                              $newName);
        $mergedNames = array();
        foreach ($segments as $segmentInfo) {
            $merger->addSource($segmentInfo);
            $mergedNames[] = $segmentInfo->getName();
        }

        $newSegment = $merger->merge();

        // The merge succeeded: the sources may now be scheduled for removal
        foreach ($mergedNames as $mergedName) {
            $this->_segmentsToDelete[$mergedName] = $mergedName;
        }

        if ($newSegment !== null) {
            $this->_newSegments[$newSegment->getName()] = $newSegment;
        }

        $this->commit();
    } catch (Exception $e) {
        // Do not keep the optimization lock when the merge or commit failed
        $optimizationLock->unlock();
        throw $e;
    }

    // optimization is finished
    $optimizationLock->unlock();
}
/**
 * Update segments file by adding the new segments to the list and dropping
 * the segments queued for deletion.
 *
 * A replacement file 'segments.new' is written and then renamed to 'segments'.
 * The in-memory segment list is synchronised with the file: entries that are
 * no longer listed are removed, new segments are appended, and segments found
 * in the file but unknown to this writer are loaded. Finally the files of the
 * deleted segments are removed from the directory.
 *
 * @throws Zend_Search_Lucene_Exception if the exclusive index lock cannot be obtained
 */
private function _updateSegments()
{
    // Get an exclusive index lock.
    // The intent is to wait until parallel searchers or indexers have finished
    // and to hold off new ones while the segments file is being updated.
    $lock = $this->_directory->getFileObject('index.lock');
    if (!$lock->lock(LOCK_EX)) {
        throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive index lock');
    }

    // Do not share file handlers to get file updates from other sessions.
    $segmentsFile   = $this->_directory->getFileObject('segments', false);
    $newSegmentFile = $this->_directory->createFile('segments.new', false);

    // Write format marker
    $newSegmentFile->writeInt((int)0xFFFFFFFF);

    // Write index version.
    // Skip the format marker, then process the 64-bit version as two 32-bit
    // words so that the arithmetic also works on 32-bit platforms.
    $segmentsFile->seek(4, SEEK_CUR);
    $versionHigh = $segmentsFile->readInt();
    $versionLow  = $segmentsFile->readInt();
    $version = $versionHigh * ((double)0xFFFFFFFF + 1) +
               (($versionLow < 0)? (double)0xFFFFFFFF - (-1 - $versionLow) : $versionLow);
    $version += $this->_versionUpdate;
    $this->_versionUpdate = 0;
    $newSegmentFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
    $newSegmentFile->writeInt((int)($version & 0xFFFFFFFF));

    // Write segment name counter
    $newSegmentFile->writeInt($segmentsFile->readInt());

    // Get number of segments offset
    $numOfSegmentsOffset = $newSegmentFile->tell();
    // Read the current number of segments and reserve room for the new one
    $segmentsCount = $segmentsFile->readInt();
    $newSegmentFile->writeInt(0);  // Write dummy data (segment counter)

    // Copy every segment that is not queued for deletion
    $segments = array();
    for ($count = 0; $count < $segmentsCount; $count++) {
        $segName = $segmentsFile->readString();
        $segSize = $segmentsFile->readInt();

        // $_segmentsToDelete is keyed by segment name, so a keyed lookup
        // replaces a linear, loosely-compared scan of its values
        if (!isset($this->_segmentsToDelete[$segName])) {
            $newSegmentFile->writeString($segName);
            $newSegmentFile->writeInt($segSize);

            $segments[$segName] = $segSize;
        }
    }
    $segmentsFile->close();

    $segmentsCount = count($segments) + count($this->_newSegments);

    // Remove segments, not listed in $segments (deleted)
    // Load segments, not listed in $this->_segmentInfos
    foreach ($this->_segmentInfos as $segId => $segInfo) {
        if (isset($segments[$segInfo->getName()])) {
            // Segment is already included into $this->_segmentInfos
            unset($segments[$segInfo->getName()]);
        } else {
            // remove deleted segment from a list
            unset($this->_segmentInfos[$segId]);
        }
    }
    // $segments contains a list of segments to load
    // do it later

    foreach ($this->_newSegments as $segName => $segmentInfo) {
        $newSegmentFile->writeString($segName);
        $newSegmentFile->writeInt($segmentInfo->count());

        $this->_segmentInfos[] = $segmentInfo;
    }
    $this->_newSegments = array();

    $newSegmentFile->seek($numOfSegmentsOffset);
    $newSegmentFile->writeInt($segmentsCount);  // Update segments count
    $newSegmentFile->close();
    $this->_directory->renameFile('segments.new', 'segments');

    // Segments file update is finished
    // Switch back to shared lock mode
    $lock->lock(LOCK_SH);

    // Remove the files of every deleted segment
    $fileList = $this->_directory->fileList();
    foreach ($this->_segmentsToDelete as $nameToDelete) {
        foreach (self::$_indexExtensions as $ext) {
            if ($this->_directory->fileExists($nameToDelete . $ext)) {
                $this->_directory->deleteFile($nameToDelete . $ext);
            }
        }

        // Per-field files are named <segment_name>.f<decimal_number>
        $fieldFilePrefix = $nameToDelete . '.f';
        $prefixLength    = strlen($fieldFilePrefix);
        foreach ($fileList as $file) {
            if (substr($file, 0, $prefixLength) === $fieldFilePrefix &&
                ctype_digit( substr($file, $prefixLength) )) {
                $this->_directory->deleteFile($file);
            }
        }
    }
    $this->_segmentsToDelete = array();

    // Load segments, created by other process
    foreach ($segments as $segName => $segSize) {
        // Load new segments
        $this->_segmentInfos[] = new Zend_Search_Lucene_Index_SegmentInfo($segName,
                                                                          $segSize,
                                                                          $this->_directory);
    }
}
/**
 * Commit current changes.
 *
 * Closes the segment that is currently being filled (if any) and queues the
 * resulting segment, then rewrites the segments file when there are new
 * segments or segments waiting to be deleted.
 */
public function commit()
{
    $writer = $this->_currentSegment;
    if ($writer !== null) {
        $closedSegment = $writer->close();
        if ($closedSegment !== null) {
            $segmentName = $closedSegment->getName();
            $this->_newSegments[$segmentName] = $closedSegment;
        }
        $this->_currentSegment = null;
    }

    $hasPendingChanges = !empty($this->_newSegments) || !empty($this->_segmentsToDelete);
    if ($hasPendingChanges) {
        $this->_updateSegments();
    }
}
/**
 * Merges the provided indexes into this index.
 *
 * Not implemented yet: the argument is accepted and nothing is done.
 *
 * @param array $readers
 * @return void
 * @todo implementation
 */
public function addIndexes($readers)
{
    // Intentionally empty until merging of external indexes is implemented.
}
/**
 * Merges all segments together into a single segment, optimizing
 * an index for search.
 *
 * Takes no input: the segments merged are the ones currently known to this
 * writer. Buffered documents are committed first, so that they take part in
 * the merge instead of being flushed into a separate segment by the commit
 * that follows it.
 *
 * @throws Zend_Search_Lucene_Exception
 */
public function optimize()
{
    // Flush pending documents so that they are part of the merged segment
    $this->commit();

    $this->_mergeSegments($this->_segmentInfos);
}
/**
 * Get name for new segment.
 *
 * Reads the segment name counter stored in the 'segments' file, writes it
 * back incremented by one and derives the name from the value that was read:
 * an underscore followed by the counter in base 36.
 *
 * @return string
 * @throws Zend_Search_Lucene_Exception if the segments file cannot be locked exclusively
 */
private function _newSegmentName()
{
    // Offset of the name counter:
    // 12 = 4 (int, file format marker) + 8 (long, index version)
    $counterOffset = 12;

    // Do not share file handler to get file updates from other sessions.
    $segmentsFile = $this->_directory->getFileObject('segments', false);

    // Get exclusive segments file lock.
    // Per the locking scheme this cannot intersect with an _updateSegments()
    // call of another process, which needs the exclusive index lock.
    if (!$segmentsFile->lock(LOCK_EX)) {
        throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive index lock');
    }

    $segmentsFile->seek($counterOffset);
    $currentCounter = $segmentsFile->readInt();

    $segmentsFile->seek($counterOffset);
    $segmentsFile->writeInt($currentCounter + 1);

    // Flush output to guarantee that a wrong value will not be loaded between
    // unlock and return (which calls $segmentsFile destructor)
    $segmentsFile->flush();

    $segmentsFile->unlock();

    return '_' . base_convert($currentCounter, 10, 36);
}