adding some strings
[moodle-linuxchix.git] / search / Zend / Search / Lucene / Index / SegmentWriter.php
blob2f1a05e3229353212ea850f6e9eb6e686949b1d4
1 <?php
2 /**
3 * Zend Framework
5 * LICENSE
7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
15 * @category Zend
16 * @package Zend_Search_Lucene
17 * @subpackage Index
18 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
23 /** Zend_Search_Lucene_Exception */
24 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
26 /** Zend_Search_Lucene_Index_SegmentInfo */
27 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/SegmentInfo.php';
30 /**
31 * @category Zend
32 * @package Zend_Search_Lucene
33 * @subpackage Index
34 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
35 * @license http://framework.zend.com/license/new-bsd New BSD License
37 abstract class Zend_Search_Lucene_Index_SegmentWriter
/**
 * Expert: The fraction of terms in the "dictionary" which should be stored
 * in RAM. Smaller values use more memory, but make searching slightly
 * faster, while larger values use less memory and make searching slightly
 * slower. Searching is typically not dominated by dictionary lookup, so
 * tweaking this is rarely useful.
 *
 * @var integer
 */
public static $indexInterval = 128;

/**
 * Expert: The fraction of TermDocs entries stored in skip tables.
 * Larger values result in smaller indexes, greater acceleration, but fewer
 * accelerable cases, while smaller values result in bigger indexes,
 * less acceleration and more accelerable cases.
 * More detailed experiments would be useful here.
 *
 * 0x7FFFFFFF (the value set below) indicates that skip data is not used.
 * NOTE(review): the original comment says "Default value is 16"; presumably
 * that refers to the Java Lucene default rather than this class - confirm.
 *
 * @var integer
 */
public static $skipInterval = 0x7FFFFFFF;

/**
 * Number of documents in the segment.
 *
 * @var integer
 */
protected $_docCount = 0;

/**
 * Segment name.
 *
 * @var string
 */
protected $_name;

/**
 * File system adapter.
 *
 * @var Zend_Search_Lucene_Storage_Directory
 */
protected $_directory;

/**
 * List of the index file names.
 * Used for automatic compound file generation.
 *
 * @var array
 */
protected $_files = array();

/**
 * Segment fields: array of Zend_Search_Lucene_Index_FieldInfo objects
 * for this segment, keyed by field name.
 *
 * @var array
 */
protected $_fields = array();

/**
 * Normalization factors.
 * An array fieldName => normVector.
 * normVector is a binary string.
 * Each byte corresponds to an indexed document in a segment and
 * encodes a normalization factor (float value, encoded by
 * Zend_Search_Lucene_Search_Similarity::encodeNorm()).
 *
 * @var array
 */
protected $_norms = array();

/**
 * '.fdx' file - Stored Fields, the field index.
 *
 * @var Zend_Search_Lucene_Storage_File
 */
protected $_fdxFile = null;

/**
 * '.fdt' file - Stored Fields, the field data.
 *
 * @var Zend_Search_Lucene_Storage_File
 */
protected $_fdtFile = null;
/**
 * Object constructor.
 *
 * Stores the storage directory and the segment name; the name is later
 * used as the base name of every segment file this writer creates.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param string $name
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
{
    $this->_name      = $name;
    $this->_directory = $directory;
}
/**
 * Add field to the segment.
 *
 * A field seen for the first time is registered under the next free field
 * number. For an already known field the "indexed" and "store term vector"
 * flags are OR-ed with the flags of the given field.
 *
 * Returns actual field number.
 *
 * @param Zend_Search_Lucene_Field $field
 * @return integer
 */
public function addField(Zend_Search_Lucene_Field $field)
{
    $fieldName = $field->name;

    if (isset($this->_fields[$fieldName])) {
        // Known field: merge the flags, keep the number assigned earlier
        $this->_fields[$fieldName]->isIndexed       |= $field->isIndexed;
        $this->_fields[$fieldName]->storeTermVector |= $field->storeTermVector;

        return $this->_fields[$fieldName]->number;
    }

    // New field: numbers are assigned sequentially, starting from 0
    $nextNumber = count($this->_fields);
    $this->_fields[$fieldName] = new Zend_Search_Lucene_Index_FieldInfo(
        $fieldName,
        $field->isIndexed,
        $nextNumber,
        $field->storeTermVector
    );

    return $nextNumber;
}
/**
 * Add fieldInfo to the segment.
 *
 * A new FieldInfo object is created for a field name seen for the first
 * time (the passed object itself is not stored). For an already known
 * field the "indexed" and "store term vector" flags are OR-ed with the
 * flags of the given fieldInfo.
 *
 * Returns actual field number.
 *
 * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
 * @return integer
 */
public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
{
    $fieldName = $fieldInfo->name;

    if (isset($this->_fields[$fieldName])) {
        // Known field: merge the flags, keep the number assigned earlier
        $this->_fields[$fieldName]->isIndexed       |= $fieldInfo->isIndexed;
        $this->_fields[$fieldName]->storeTermVector |= $fieldInfo->storeTermVector;

        return $this->_fields[$fieldName]->number;
    }

    // New field: numbers are assigned sequentially, starting from 0
    $nextNumber = count($this->_fields);
    $this->_fields[$fieldName] = new Zend_Search_Lucene_Index_FieldInfo(
        $fieldName,
        $fieldInfo->isIndexed,
        $nextNumber,
        $fieldInfo->storeTermVector
    );

    return $nextNumber;
}
/**
 * Returns the FieldInfo objects registered for this segment,
 * as an array keyed by field name.
 *
 * @return array
 */
public function getFieldInfos()
{
    return $this->_fields;
}
/**
 * Add stored fields information.
 *
 * Appends one document record to the '.fdt' file and its start offset to
 * the '.fdx' file, creating both files on the first call, then increments
 * the segment document counter.
 *
 * @param array $storedFields array of Zend_Search_Lucene_Field objects
 */
public function addStoredFields($storedFields)
{
    if (!isset($this->_fdxFile)) {
        // First document of the segment: create the stored fields files
        $fdxName = $this->_name . '.fdx';
        $fdtName = $this->_name . '.fdt';

        $this->_fdxFile = $this->_directory->createFile($fdxName);
        $this->_fdtFile = $this->_directory->createFile($fdtName);

        $this->_files[] = $fdxName;
        $this->_files[] = $fdtName;
    }

    $fdt = $this->_fdtFile;

    // The field index holds the offset of the document record in the field data file
    $this->_fdxFile->writeLong($fdt->tell());
    $fdt->writeVInt(count($storedFields));

    foreach ($storedFields as $storedField) {
        $fdt->writeVInt($this->_fields[$storedField->name]->number);

        $flags = 0x00;
        if ($storedField->isTokenized) {
            $flags |= 0x01;
        }
        if ($storedField->isBinary) {
            $flags |= 0x02;
        }
        // 0x04 - third bit, compressed (ZLIB) - is never set here
        $fdt->writeByte($flags);

        if (!$storedField->isBinary) {
            $fdt->writeString($storedField->getUtf8Value());
            continue;
        }

        // Binary values are written as a byte count followed by the raw bytes
        $fdt->writeVInt(strlen($storedField->value));
        $fdt->writeBytes($storedField->value);
    }

    $this->_docCount++;
}
/**
 * Returns the total number of documents in this segment
 * (the counter is incremented by each addStoredFields() call).
 *
 * @return integer
 */
public function count()
{
    return $this->_docCount;
}
/**
 * Dump Field Info (.fnm) segment file.
 *
 * Also writes one norm file ('.f' + field number) for every indexed field
 * and registers all the created files in the segment file list.
 */
protected function _dumpFNM()
{
    $fnmName = $this->_name . '.fnm';
    $fnmFile = $this->_directory->createFile($fnmName);
    $fnmFile->writeVInt(count($this->_fields));

    foreach ($this->_fields as $fieldInfo) {
        $fnmFile->writeString($fieldInfo->name);

        $fieldBits = 0x00;
        if ($fieldInfo->isIndexed) {
            $fieldBits |= 0x01;
        }
        if ($fieldInfo->storeTermVector) {
            $fieldBits |= 0x02;
        }
        // Not supported yet:
        //   0x04 - term positions are stored with the term vectors
        //   0x08 - term offsets are stored with the term vectors
        $fnmFile->writeByte($fieldBits);

        if (!$fieldInfo->isIndexed) {
            continue;
        }

        // Norms are kept only for indexed fields, one file per field
        $normFileName = $this->_name . '.f' . $fieldInfo->number;
        $normFile     = $this->_directory->createFile($normFileName);
        $normFile->writeBytes($this->_norms[$fieldInfo->name]);
        $this->_files[] = $normFileName;
    }

    // The .fnm file is registered after the norm files
    $this->_files[] = $fnmName;
}
/**
 * Term Dictionary file ('.tis').
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_tisFile = null;

/**
 * Term Dictionary index file ('.tii').
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_tiiFile = null;

/**
 * Frequencies file ('.frq').
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_frqFile = null;

/**
 * Positions file ('.prx').
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_prxFile = null;

/**
 * Number of written terms.
 *
 * @var integer
 */
private $_termCount;

/**
 * Last saved term.
 *
 * @var Zend_Search_Lucene_Index_Term
 */
private $_prevTerm;

/**
 * Last saved term info.
 *
 * @var Zend_Search_Lucene_Index_TermInfo
 */
private $_prevTermInfo;

/**
 * Last saved index term.
 *
 * @var Zend_Search_Lucene_Index_Term
 */
private $_prevIndexTerm;

/**
 * Last saved index term info.
 *
 * @var Zend_Search_Lucene_Index_TermInfo
 */
private $_prevIndexTermInfo;

/**
 * Last term dictionary file position.
 *
 * @var integer
 */
private $_lastIndexPosition;
/**
 * Create dictionary, frequency and positions files and write necessary headers.
 *
 * Also registers the four files in the segment file list and resets the
 * term writing state used by addTerm().
 */
public function initializeDictionaryFiles()
{
    $tisName = $this->_name . '.tis';
    $tiiName = $this->_name . '.tii';
    $frqName = $this->_name . '.frq';
    $prxName = $this->_name . '.prx';

    // Term dictionary header: 4 + 8 + 4 + 4 = 20 bytes
    $this->_tisFile = $this->_directory->createFile($tisName);
    $tis = $this->_tisFile;
    $tis->writeInt((int)0xFFFFFFFE);
    $tis->writeLong(0 /* dummy data for terms count */);
    $tis->writeInt(self::$indexInterval);
    $tis->writeInt(self::$skipInterval);

    // Term dictionary index header has the same layout
    $this->_tiiFile = $this->_directory->createFile($tiiName);
    $tii = $this->_tiiFile;
    $tii->writeInt((int)0xFFFFFFFE);
    $tii->writeLong(0 /* dummy data for terms count */);
    $tii->writeInt(self::$indexInterval);
    $tii->writeInt(self::$skipInterval);

    /** Dump dictionary header */
    $tii->writeVInt(0);                 // prefix length
    $tii->writeString('');              // suffix
    $tii->writeInt((int)0xFFFFFFFF);    // field number
    $tii->writeByte((int)0x0F);
    $tii->writeVInt(0);                 // DocFreq
    $tii->writeVInt(0);                 // FreqDelta
    $tii->writeVInt(0);                 // ProxDelta
    $tii->writeVInt(20);                // IndexDelta

    $this->_frqFile = $this->_directory->createFile($frqName);
    $this->_prxFile = $this->_directory->createFile($prxName);

    array_push($this->_files, $tisName, $tiiName, $frqName, $prxName);

    // Reset term writing state
    $this->_termCount         = 0;
    $this->_lastIndexPosition = 20;
    $this->_prevTerm          = null;
    $this->_prevTermInfo      = null;
    $this->_prevIndexTerm     = null;
    $this->_prevIndexTermInfo = null;
}
/**
 * Add term.
 *
 * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
 *
 * Writes the term's postings to the frequencies and positions files, then
 * a term dictionary entry; every $indexInterval-th term is also written to
 * the term dictionary index together with its dictionary position delta.
 *
 * @param Zend_Search_Lucene_Index_Term $termEntry
 * @param array $termDocs
 */
public function addTerm($termEntry, $termDocs)
{
    $frq = $this->_frqFile;
    $prx = $this->_prxFile;

    $freqPointer = $frq->tell();
    $proxPointer = $prx->tell();

    $lastDocId = 0;
    foreach ($termDocs as $docId => $positions) {
        // Doc delta is shifted left by one; the low bit set means "frequency is 1"
        $shiftedDelta = ($docId - $lastDocId)*2;
        $lastDocId    = $docId;

        if (count($positions) > 1) {
            $frq->writeVInt($shiftedDelta);
            $frq->writeVInt(count($positions));
        } else {
            $frq->writeVInt($shiftedDelta + 1);
        }

        // Positions are stored as deltas from the previous position
        $lastPosition = 0;
        foreach ($positions as $currentPosition) {
            $prx->writeVInt($currentPosition - $lastPosition);
            $lastPosition = $currentPosition;
        }
    }

    $docFreq = count($termDocs);

    /**
     * @todo Write Skip Data to a freq file.
     * It's not used now, but make index more optimal
     */
    $skipOffset = ($docFreq >= self::$skipInterval) ? $frq->tell() - $freqPointer
                                                    : 0;

    $term     = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                                  $this->_fields[$termEntry->field]->number);
    $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq,
                                                      $freqPointer, $proxPointer, $skipOffset);

    $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

    if (($this->_termCount + 1) % self::$indexInterval == 0) {
        $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

        // IndexDelta: distance in the .tis file from the previous index entry
        $tisPosition = $this->_tisFile->tell();
        $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
        $this->_lastIndexPosition = $tisPosition;
    }

    $this->_termCount++;
}
/**
 * Close dictionary.
 *
 * Replaces the dummy term counts written by initializeDictionaryFiles()
 * (at offset 4 of the '.tis' and '.tii' headers) with the actual values.
 */
public function closeDictionaryFiles()
{
    $this->_tisFile->seek(4);
    $this->_tisFile->writeLong($this->_termCount);

    // The index holds one special entry (the empty term written together with
    // the header) plus one entry for every $indexInterval-th term added by
    // addTerm(), i.e. floor(termCount / indexInterval) + 1 entries.
    // The subtraction makes the division exact, so the result stays an integer.
    $indexedTerms   = ($this->_termCount - $this->_termCount % self::$indexInterval)
                    / self::$indexInterval;
    $indexTermCount = $indexedTerms + 1;

    $this->_tiiFile->seek(4);
    $this->_tiiFile->writeLong($indexTermCount);
}
/**
 * Dump Term Dictionary segment file entry.
 * Used to write entry to .tis or .tii files
 *
 * The term text is written as the number of leading characters shared with
 * the previous term of the same field, followed by the remaining suffix.
 * Frequency and proximity pointers are written as deltas from the previous
 * term info when there is one. $prevTerm and $prevTermInfo are passed by
 * reference and are updated to the term and term info just written.
 *
 * @param Zend_Search_Lucene_Storage_File $dicFile
 * @param Zend_Search_Lucene_Index_Term $prevTerm
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
 * @param Zend_Search_Lucene_Index_TermInfo $termInfo
 */
protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                      &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                      &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
{
    $text = $term->text;

    if (isset($prevTerm) && $prevTerm->field == $term->field) {
        $prevText = $prevTerm->text;

        // Length of the common prefix, measured in bytes
        $limit       = min(strlen($prevText), strlen($text));
        $commonBytes = 0;
        while ($commonBytes < $limit && $prevText[$commonBytes] == $text[$commonBytes]) {
            $commonBytes++;
        }

        // Convert it to whole UTF-8 characters: a character is counted only
        // if all of its bytes lie inside the common byte prefix
        $sharedBytes = 0;
        $sharedChars = 0;
        while ($sharedBytes < $commonBytes) {
            $leadByte = ord($text[$sharedBytes]);

            if (($leadByte & 0xC0) != 0xC0) {
                $charWidth = 1;
            } else if (!($leadByte & 0x20)) {
                $charWidth = 2;
            } else if (!($leadByte & 0x10)) {
                $charWidth = 3;
            } else {
                $charWidth = 4;
            }

            if ($sharedBytes + $charWidth > $commonBytes) {
                // char crosses matched bytes boundary, skip it
                break;
            }

            $sharedChars++;
            $sharedBytes += $charWidth;
        }

        $prefixLength = $sharedChars;
        $suffix       = substr($text, $sharedBytes);
    } else {
        // No previous term in this field: the whole text is the suffix
        $prefixLength = 0;
        $suffix       = $text;
    }

    $dicFile->writeVInt($prefixLength);         // prefix length
    $dicFile->writeString($suffix);             // suffix
    $dicFile->writeVInt($term->field);          // field number
    $dicFile->writeVInt($termInfo->docFreq);    // DocFreq (the count of documents which contain the term)

    $prevTerm = $term;

    if (isset($prevTermInfo)) {
        // FreqDelta and ProxDelta relative to the previous entry
        $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
        $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
    } else {
        // First entry: pointers are written as they are
        $dicFile->writeVInt($termInfo->freqPointer);
        $dicFile->writeVInt($termInfo->proxPointer);
    }

    // SkipOffset is written only when it is not 0
    // (addTerm() sets it when docFreq >= self::$skipInterval)
    if ($termInfo->skipOffset != 0) {
        $dicFile->writeVInt($termInfo->skipOffset);
    }

    $prevTermInfo = $termInfo;
}
/**
 * Generate compound index file.
 *
 * Writes a '.cfs' file consisting of a directory (data offset + file name
 * for every registered segment file) followed by the contents of those
 * files. Each source file is deleted after it has been copied.
 *
 * @throws Zend_Search_Lucene_Exception if a source file yields no data
 *                                      before its reported length is copied
 */
protected function _generateCFS()
{
    $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
    $cfsFile->writeVInt(count($this->_files));

    // First pass: write the directory with placeholder offsets and remember
    // where each placeholder lives
    $dataOffsetPointers = array();
    foreach ($this->_files as $fileName) {
        $dataOffsetPointers[$fileName] = $cfsFile->tell();
        $cfsFile->writeLong(0); // write dummy data
        $cfsFile->writeString($fileName);
    }

    // Second pass: patch the real offset in, then append the file data
    foreach ($this->_files as $fileName) {
        // Get actual data offset
        $dataOffset = $cfsFile->tell();
        // Seek to the data offset pointer
        $cfsFile->seek($dataOffsetPointers[$fileName]);
        // Write actual data offset value
        $cfsFile->writeLong($dataOffset);
        // Seek back to the end of file
        $cfsFile->seek($dataOffset);

        $dataFile = $this->_directory->getFileObject($fileName);

        $byteCount = $this->_directory->fileLength($fileName);
        while ($byteCount > 0) {
            $data      = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
            $bytesRead = strlen($data);

            if ($bytesRead == 0) {
                // Without this check an empty read would never decrease
                // $byteCount and the loop would spin forever
                throw new Zend_Search_Lucene_Exception(
                    'Unexpected end of file \'' . $fileName . '\' while generating compound file.'
                );
            }

            $byteCount -= $bytesRead;
            $cfsFile->writeBytes($data);
        }

        $this->_directory->deleteFile($fileName);
    }
}
/**
 * Close segment, write it to disk and return segment info.
 *
 * Must be implemented by concrete segment writers.
 *
 * @return Zend_Search_Lucene_Index_SegmentInfo
 */
abstract public function close();