Merge commit 'catalyst/MOODLE_19_STABLE' into mdl19-linuxchix
[moodle-linuxchix.git] / search / Zend / Search / Lucene / Index / SegmentInfo.php
blobc6f7868fd509e15ce0851a08bcb5fc0ac11a8613
1 <?php
2 /**
3 * Zend Framework
5 * LICENSE
7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
15 * @category Zend
16 * @package Zend_Search_Lucene
17 * @subpackage Index
18 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
22 /** Zend_Search_Lucene_Index_DictionaryLoader */
23 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Index/DictionaryLoader.php';
26 /** Zend_Search_Lucene_Exception */
27 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
30 /**
31 * @category Zend
32 * @package Zend_Search_Lucene
33 * @subpackage Index
34 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
35 * @license http://framework.zend.com/license/new-bsd New BSD License
37 class Zend_Search_Lucene_Index_SegmentInfo
/**
 * Number of documents in the segment (deleted documents included).
 *
 * @var integer
 */
private $_docCount;

/**
 * Segment name; used as the base name of every segment file.
 *
 * @var string
 */
private $_name;

/**
 * Term Dictionary Index
 *
 * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
 * of performance considerations). As consumed by getTermInfo():
 * [0] -> $termFieldNum
 * [1] -> $termValue
 *
 * The corresponding term info is stored at the same index in $_termDictionaryInfos.
 * Stays null until the dictionary index is first needed.
 *
 * @var array|null
 */
private $_termDictionary;

/**
 * Term Dictionary Index TermInfos
 *
 * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
 * of performance considerations)
 * [0] -> $docFreq
 * [1] -> $freqPointer
 * [2] -> $proxPointer
 * [3] -> $skipOffset
 * [4] -> $indexPointer
 *
 * @var array
 */
private $_termDictionaryInfos;

/**
 * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment,
 * keyed by field number.
 *
 * @var array
 */
private $_fields;

/**
 * Field positions in a dictionary: field number => position of the field
 * when fields are ordered by name.
 * (Term dictionary contains fields ordered by names)
 *
 * @var array
 */
private $_fieldsDicPositions;


/**
 * Associative array where the key is the file name and the value is the data offset
 * in a compound segment file (.cfs).
 *
 * @var array
 */
private $_segFiles;

/**
 * Associative array where the key is the file name and the value is the size of
 * that file inside the compound segment file (.cfs).
 *
 * @var array
 */
private $_segFileSizes;


/**
 * File system adapter.
 *
 * NOTE(review): the constructor documents its argument as
 * Zend_Search_Lucene_Storage_Directory - confirm which type is intended.
 *
 * @var Zend_Search_Lucene_Storage_Directory_Filesystem
 */
private $_directory;

/**
 * Normalization factors.
 * An array fieldNum => normVector (the code indexes this array by field number).
 * normVector is a binary string.
 * Each byte corresponds to an indexed document in a segment and
 * encodes normalization factor (float value, encoded by
 * Zend_Search_Lucene_Search_Similarity::encodeNorm())
 *
 * @var array
 */
private $_norms = array();

/**
 * List of deleted documents.
 * bitset if bitset extension is loaded or array (docId => 1) otherwise;
 * null when the segment has no deletions.
 *
 * @var mixed
 */
private $_deleted;

/**
 * $this->_deleted update flag: true when deletions were changed in memory
 * and have not been written by writeChanges() yet.
 *
 * @var boolean
 */
private $_deletedDirty = false;
/**
 * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname,
 * Documents count and Directory as a parameter.
 *
 * Reads, in order: the compound file (.cfs) table of contents if the file exists,
 * the field infos (.fnm) and the deleted documents list (.del) if present.
 *
 * @param string $name
 * @param integer $docCount
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @throws Zend_Search_Lucene_Exception
 */
public function __construct($name, $docCount, $directory)
{
    $this->_name           = $name;
    $this->_docCount       = $docCount;
    $this->_directory      = $directory;
    $this->_termDictionary = null;

    $this->_segFiles     = array();
    $this->_segFileSizes = array();
    if ($this->_directory->fileExists($name . '.cfs')) {
        $cfsFile       = $this->_directory->getFileObject($name . '.cfs');
        $segFilesCount = $cfsFile->readVInt();

        for ($count = 0; $count < $segFilesCount; $count++) {
            $dataOffset = $cfsFile->readLong();
            if ($count != 0) {
                // Size of the previous entry = this offset - previous offset
                $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
            }
            $fileName = $cfsFile->readString();
            $this->_segFiles[$fileName] = $dataOffset;
        }
        if ($count != 0) {
            // The last entry extends to the end of the compound file
            $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
        }
    }

    $fnmFile       = $this->openCompoundFile('.fnm');
    $fieldsCount   = $fnmFile->readVInt();
    $fieldNames    = array();
    $fieldNums     = array();
    $this->_fields = array();
    for ($count = 0; $count < $fieldsCount; $count++) {
        $fieldName = $fnmFile->readString();
        $fieldBits = $fnmFile->readByte();
        $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                        $fieldBits & 1,
                                                                        $count,
                                                                        $fieldBits & 2 );
        if ($fieldBits & 0x10) {
            // norms are omitted for the indexed field
            $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
        }

        $fieldNums[$count]  = $count;
        $fieldNames[$count] = $fieldName;
    }
    // Order field numbers by field name, then invert: field number => dictionary position
    array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
    $this->_fieldsDicPositions = array_flip($fieldNums);

    try {
        $delFile = $this->openCompoundFile('.del');

        $byteCount = $delFile->readInt();
        $byteCount = ceil($byteCount/8);
        $bitCount  = $delFile->readInt();

        if ($bitCount == 0) {
            $delBytes = '';
        } else {
            $delBytes = $delFile->readBytes($byteCount);
        }

        if (extension_loaded('bitset')) {
            $this->_deleted = $delBytes;
        } else {
            $this->_deleted = array();
            // Walk only the bytes actually read: $delBytes is empty when $bitCount is 0,
            // and square-bracket offsets are used because the {} form was removed in PHP 8.
            $bytesRead = strlen($delBytes);
            for ($count = 0; $count < $bytesRead; $count++) {
                $byte = ord($delBytes[$count]);
                for ($bit = 0; $bit < 8; $bit++) {
                    if ($byte & (1<<$bit)) {
                        $this->_deleted[$count*8 + $bit] = 1;
                    }
                }
            }
        }
    } catch(Zend_Search_Exception $e) {
        // NOTE(review): openCompoundFile() throws Zend_Search_Lucene_Exception; this catch
        // presumably relies on it extending Zend_Search_Exception - confirm.
        if (strpos($e->getMessage(), 'compound file doesn\'t contain') !== false ) {
            // No .del file: the segment has no deletions
            $this->_deleted = null;
        } else {
            throw $e;
        }
    }
}
/**
 * Opens an index file of this segment.
 *
 * A standalone file named <segment name><extension> is preferred; otherwise the
 * compound file (.cfs) is opened and positioned at the start of the requested entry.
 *
 * @param string $extension
 * @param boolean $shareHandler
 * @throws Zend_Search_Lucene_Exception when neither a standalone file nor a compound entry exists
 * @return Zend_Search_Lucene_Storage_File
 */
public function openCompoundFile($extension, $shareHandler = true)
{
    $target = $this->_name . $extension;

    // A regular (non-compound) file takes precedence
    if ($this->_directory->fileExists($target)) {
        return $this->_directory->getFileObject($target, $shareHandler);
    }

    if (isset($this->_segFiles[$target])) {
        $compound = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
        $compound->seek($this->_segFiles[$target]);

        return $compound;
    }

    throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                   . $target . ' file.' );
}
/**
 * Returns the length of an index file of this segment.
 *
 * The length of a standalone file is preferred; otherwise the size recorded
 * for the entry inside the compound file is returned.
 *
 * @param string $extension
 * @throws Zend_Search_Lucene_Exception when neither a standalone file nor a compound entry exists
 * @return integer
 */
public function compoundFileLength($extension)
{
    $target = $this->_name . $extension;

    // A regular (non-compound) file takes precedence
    if ($this->_directory->fileExists($target)) {
        return $this->_directory->fileLength($target);
    }

    if (isset($this->_segFileSizes[$target])) {
        return $this->_segFileSizes[$target];
    }

    throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                   . $target . ' file.' );
}
/**
 * Looks a field up by name and returns its number, or -1 if the segment has no such field.
 *
 * @param string $fieldName
 * @return integer
 */
public function getFieldNum($fieldName)
{
    $found = -1;

    foreach ($this->_fields as $info) {
        if ($info->name == $fieldName) {
            $found = $info->number;
            break;
        }
    }

    return $found;
}
/**
 * Returns the field info stored under the given field number.
 *
 * @param integer $fieldNum
 * @return Zend_Search_Lucene_Index_FieldInfo
 */
public function getField($fieldNum)
{
    $info = $this->_fields[$fieldNum];

    return $info;
}
/**
 * Returns the field names of this segment as an array (name => name).
 * When $indexed is true, only indexed fields are included.
 *
 * @param boolean $indexed
 * @return array
 */
public function getFields($indexed = false)
{
    $names = array();

    foreach ($this->_fields as $info) {
        if ($indexed && !$info->isIndexed) {
            // Caller asked for indexed fields only
            continue;
        }

        $names[$info->name] = $info->name;
    }

    return $names;
}
/**
 * Returns all Zend_Search_Lucene_Index_FieldInfo objects of this segment.
 *
 * @return array
 */
public function getFieldInfos()
{
    $infos = $this->_fields;

    return $infos;
}
/**
 * Returns the document count of this segment; deleted documents are counted too.
 *
 * @return integer
 */
public function count()
{
    $total = $this->_docCount;

    return $total;
}
/**
 * Counts the documents marked as deleted (0 when there is no deletions list).
 *
 * @return integer
 */
private function _deletedCount()
{
    if ($this->_deleted === null) {
        return 0;
    }

    // With the bitset extension the list is a bitset and must be expanded before counting
    $deletedIds = extension_loaded('bitset')
                ? bitset_to_array($this->_deleted)
                : $this->_deleted;

    return count($deletedIds);
}
/**
 * Returns the number of documents in this segment that are not deleted.
 *
 * @return integer
 */
public function numDocs()
{
    return $this->hasDeletions()
         ? $this->_docCount - $this->_deletedCount()
         : $this->_docCount;
}
/**
 * Translates a field number into its position in the fields dictionary.
 *
 * @param integer $fieldNum
 * @return integer
 */
private function _getFieldPosition($fieldNum) {
    if (isset($this->_fieldsDicPositions[$fieldNum])) {
        return $this->_fieldsDicPositions[$fieldNum];
    }

    // Values missing from the translation table are used as a 'direct value'
    return $fieldNum;
}
/**
 * Returns the name of this segment.
 *
 * @return string
 */
public function getName()
{
    $segmentName = $this->_name;

    return $segmentName;
}
/**
 * TermInfo cache: term key => TermInfo (or null for a term that was looked up and not found).
 *
 * Size is 1024; a cache hit re-inserts the entry at the end of the array and
 * clean-up removes entries from the front.
 * Numbers are used instead of class constants because of performance considerations
 *
 * @var array
 */
private $_termInfoCache = array();
/**
 * Shrinks the TermInfo cache by dropping entries from its front
 * until 768 entries are left.
 */
private function _cleanUpTermInfoCache()
{
    // Entries at the front are dropped first (cache hits are moved to the end)
    foreach (array_keys($this->_termInfoCache) as $cacheKey) {
        unset($this->_termInfoCache[$cacheKey]);

        // leave 768 last used term infos
        if (count($this->_termInfoCache) == 768) {
            break;
        }
    }
}
/**
 * Scans terms dictionary and returns term info
 *
 * Lookup order:
 *  1. the in-memory TermInfo cache;
 *  2. the term dictionary index (loaded on first use from the '.sti' file, or built
 *     from '.tii' and then saved as '.sti');
 *  3. a forward scan of the '.tis' file starting at the closest preceding index entry.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @throws Zend_Search_Lucene_Exception on an unexpected '.tis' format version
 * @return Zend_Search_Lucene_Index_TermInfo|null null if the field or the term is not in the segment
 */
public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
{
    $termKey = $term->key();
    if (isset($this->_termInfoCache[$termKey])) {
        $termInfo = $this->_termInfoCache[$termKey];

        // Move termInfo to the end of cache
        unset($this->_termInfoCache[$termKey]);
        $this->_termInfoCache[$termKey] = $termInfo;

        return $termInfo;
    }


    if ($this->_termDictionary === null) {
        // Check, if index is already serialized
        if ($this->_directory->fileExists($this->_name . '.sti')) {
            // Prefetch dictionary index data
            $stiFile = $this->_directory->getFileObject($this->_name . '.sti');
            $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));

            // Load dictionary index data
            // NOTE(review): unserialize() on file contents - acceptable only while the index
            // directory is not writable by untrusted parties.
            list($this->_termDictionary, $this->_termDictionaryInfos) = unserialize($stiFileData);
        } else {
            // Prefetch dictionary index data
            $tiiFile = $this->openCompoundFile('.tii');
            $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

            // Load dictionary index data
            list($this->_termDictionary, $this->_termDictionaryInfos) =
                        Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);

            // Save the parsed form so the next load can skip parsing
            $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
            $stiFile = $this->_directory->createFile($this->_name . '.sti');
            $stiFile->writeBytes($stiFileData);
        }
    }


    $searchField = $this->getFieldNum($term->field);

    if ($searchField == -1) {
        return null;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // search for appropriate value in dictionary
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        // Order by field dictionary position first, then by term text
        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($term->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid+1;
        } else {
            // We got it!
            $a = $this->_termDictionaryInfos[$mid];
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

            // Put loaded termInfo into cache
            $this->_termInfoCache[$termKey] = $termInfo;

            // Keep the cache bounded on this path too
            if (count($this->_termInfoCache) >= 1024) {
                $this->_cleanUpTermInfoCache();
            }

            return $termInfo;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        return null;
    }

    // $highIndex now addresses the last index entry that sorts before the searched term
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    $tisFile = $this->openCompoundFile('.tis');
    $tiVersion = $tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $termCount     = $tisFile->readLong();
    $indexInterval = $tisFile->readInt();
    $skipInterval  = $tisFile->readInt();

    // The 20 header bytes have just been consumed, so seek relative to the current position
    $tisFile->seek($prevTermInfo[4] /* indexPointer */ - 20 /* header size*/, SEEK_CUR);

    $termValue    = $prevTerm[1] /* text */;
    $termFieldNum = $prevTerm[0] /* field */;
    $freqPointer  = $prevTermInfo[1] /* freqPointer */;
    $proxPointer  = $prevTermInfo[2] /* proxPointer */;
    for ($count = $prevPosition*$indexInterval + 1;
         $count <= $termCount &&
         ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
          ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
           strcmp($termValue, $term->text) < 0) );
         $count++) {
        $termPrefixLength = $tisFile->readVInt();
        $termSuffix       = $tisFile->readString();
        $termFieldNum     = $tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

        $docFreq      = $tisFile->readVInt();
        $freqPointer += $tisFile->readVInt();
        $proxPointer += $tisFile->readVInt();
        if( $docFreq >= $skipInterval ) {
            $skipOffset = $tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }
    }

    // strcmp() instead of ==: loose comparison treats numeric-looking strings
    // (e.g. "1e3" and "1000") as equal and would report a term that is not in the index.
    if ($termFieldNum == $searchField && strcmp($termValue, $term->text) === 0) {
        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    } else {
        $termInfo = null;
    }

    // Put loaded termInfo into cache
    $this->_termInfoCache[$termKey] = $termInfo;

    // >= rather than ==: the size can already be past 1024 when this point is reached
    if (count($this->_termInfoCache) >= 1024) {
        $this->_cleanUpTermInfoCache();
    }

    return $termInfo;
}
/**
 * Returns the frequencies of a term.
 * Result array structure: array(docId => freq, ...), where every docId is increased by $shift.
 * An empty array is returned when the term is not found in this segment.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift
 * @return array
 */
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0)
{
    $info = $this->getTermInfo($term);

    if (!$info instanceof Zend_Search_Lucene_Index_TermInfo) {
        return array();
    }

    $frq = $this->openCompoundFile('.frq');
    $frq->seek($info->freqPointer,SEEK_CUR);

    $freqsByDoc = array();
    $docId      = 0;
    $remaining  = $info->docFreq;

    while ($remaining-- > 0) {
        $docDelta = $frq->readVInt();

        if ($docDelta % 2 == 1) {
            // Odd delta: frequency is 1 and is not stored separately
            $docId += ($docDelta-1)/2;
            $freq   = 1;
        } else {
            // Even delta: the frequency follows as its own VInt
            $docId += $docDelta/2;
            $freq   = $frq->readVInt();
        }

        $freqsByDoc[$shift + $docId] = $freq;
    }

    return $freqsByDoc;
}
/**
 * Returns the positions of a term.
 * Result array structure: array(docId => array(pos1, pos2, ...), ...), where every docId
 * is increased by $shift.
 * An empty array is returned when the term is not found in this segment.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift
 * @return array
 */
public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0)
{
    $info = $this->getTermInfo($term);

    if (!$info instanceof Zend_Search_Lucene_Index_TermInfo) {
        return array();
    }

    // Pass 1: per-document frequencies. This is finished before '.prx' is opened.
    $frq = $this->openCompoundFile('.frq');
    $frq->seek($info->freqPointer,SEEK_CUR);

    $freqsByDoc = array();
    $docId      = 0;
    $remaining  = $info->docFreq;

    while ($remaining-- > 0) {
        $docDelta = $frq->readVInt();

        if ($docDelta % 2 == 1) {
            $docId += ($docDelta-1)/2;
            $freqsByDoc[$docId] = 1;
        } else {
            $docId += $docDelta/2;
            $freqsByDoc[$docId] = $frq->readVInt();
        }
    }

    // Pass 2: positions, stored as deltas from the previous position in the same document
    $positionsByDoc = array();
    $prx = $this->openCompoundFile('.prx');
    $prx->seek($info->proxPointer, SEEK_CUR);

    foreach ($freqsByDoc as $docId => $freq) {
        $position = 0;
        $docPositions = array();

        for ($read = 0; $read < $freq; $read++) {
            $position += $prx->readVInt();
            $docPositions[] = $position;
        }

        $positionsByDoc[$shift + $docId] = $docPositions;
    }

    return $positionsByDoc;
}
/**
 * Reads the normalization factors of a field from its '.f<fieldNum>' index file
 * (one byte per document) and keeps them in $this->_norms.
 *
 * @param integer $fieldNum
 */
private function _loadNorm($fieldNum)
{
    $normFile   = $this->openCompoundFile('.f' . $fieldNum);
    $normVector = $normFile->readBytes($this->_docCount);

    $this->_norms[$fieldNum] = $normVector;
}
/**
 * Returns normalization factor for specified documents
 *
 * @param integer $id
 * @param string $fieldName
 * @return float|null null if the field is unknown in this segment or is not indexed
 */
public function norm($id, $fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    // getFieldNum() reports an unknown field as -1; guard before indexing $_fields,
    // the same way normVector() does.
    if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) {
        return null;
    }

    if (!isset($this->_norms[$fieldNum])) {
        $this->_loadNorm($fieldNum);
    }

    // Square-bracket string offset: the {} form was removed in PHP 8
    return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
}
/**
 * Returns the norm vector of a field as a byte string (one byte per document).
 *
 * For a field that is unknown or not indexed, a vector filled with the default
 * similarity's encoded lengthNorm($fieldName, 0) is returned instead.
 *
 * @param string $fieldName
 * @return string
 */
public function normVector($fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    if ($fieldNum != -1 && $this->_fields[$fieldNum]->isIndexed) {
        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return $this->_norms[$fieldNum];
    }

    $similarity  = Zend_Search_Lucene_Search_Similarity::getDefault();
    $defaultNorm = chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));

    return str_repeat($defaultNorm, $this->_docCount);
}
/**
 * Tells whether this segment carries a deletions list
 * (i.e. documents have been deleted from it).
 *
 * @return boolean
 */
public function hasDeletions()
{
    $hasList = ($this->_deleted !== null);

    return $hasList;
}
/**
 * Marks a document of this segment as deleted.
 * $id is an internal document id. The change stays in memory until writeChanges() is called.
 *
 * @param integer $id
 */
public function delete($id)
{
    $this->_deletedDirty = true;

    if (!extension_loaded('bitset')) {
        // Plain array representation: docId => 1
        if ($this->_deleted === null) {
            $this->_deleted = array();
        }
        $this->_deleted[$id] = 1;

        return;
    }

    // bitset extension representation
    if ($this->_deleted === null) {
        $this->_deleted = bitset_empty($id);
    }
    bitset_incl($this->_deleted, $id);
}
/**
 * Tells whether the document with the given internal id is marked as deleted.
 *
 * @param integer $id
 * @return boolean
 */
public function isDeleted($id)
{
    if ($this->_deleted === null) {
        // No deletions list at all
        return false;
    }

    return extension_loaded('bitset')
         ? bitset_in($this->_deleted, $id)
         : isset($this->_deleted[$id]);
}
/**
 * Write changes if it's necessary.
 *
 * When deletions were modified since the last write, (re)creates the '<segment name>.del'
 * file: document count, number of deleted documents, then the deletion bits.
 */
public function writeChanges()
{
    if (!$this->_deletedDirty) {
        return;
    }

    if (extension_loaded('bitset')) {
        $delBytes = $this->_deleted;
        $bitCount = count(bitset_to_array($delBytes));
    } else {
        $byteCount = (int)floor($this->_docCount/8) + 1;
        // Build the bit vector by appending bytes; in-place {} string offsets were removed in PHP 8
        $delBytes = '';
        for ($count = 0; $count < $byteCount; $count++) {
            $byte = 0;
            for ($bit = 0; $bit < 8; $bit++) {
                if (isset($this->_deleted[$count*8 + $bit])) {
                    $byte |= (1<<$bit);
                }
            }
            $delBytes .= chr($byte);
        }
        $bitCount = count($this->_deleted);
    }


    $delFile = $this->_directory->createFile($this->_name . '.del');
    $delFile->writeInt($this->_docCount);
    $delFile->writeInt($bitCount);
    $delFile->writeBytes($delBytes);

    $this->_deletedDirty = false;
}
/**
 * Term Dictionary File object for stream like terms reading
 * (null while no terms stream is open).
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_tisFile = null;

/**
 * Frequencies File object for stream like terms reading
 * (null while no terms stream is open).
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_frqFile = null;

/**
 * Offset of the .frq file in the compound file
 * (the file position reported right after the file is opened in reset()).
 *
 * @var integer
 */
private $_frqFileOffset;

/**
 * Positions File object for stream like terms reading
 * (null while no terms stream is open).
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_prxFile = null;

/**
 * Offset of the .prx file in the compound file
 * (the file position reported right after the file is opened in reset()).
 *
 * @var integer
 */
private $_prxFileOffset;


/**
 * Number of terms still to be read from the terms stream
 *
 * @var integer
 */
private $_termCount = 0;

/**
 * Segment skip interval
 *
 * @var integer
 */
private $_skipInterval;

/**
 * Last TermInfo in a terms stream
 *
 * @var Zend_Search_Lucene_Index_TermInfo
 */
private $_lastTermInfo = null;

/**
 * Last Term in a terms stream
 *
 * @var Zend_Search_Lucene_Index_Term
 */
private $_lastTerm = null;

/**
 * Map of the document IDs: segment docId => docId assigned by reset().
 * Used to get new docID after removing deleted documents.
 * It's not very efficient in terms of memory usage,
 * but much faster than other methods
 *
 * @var array|null
 */
private $_docMap = null;

/**
 * An array of all positions of the last read term in the documents.
 * Array structure: array( docId => array( pos1, pos2, ...), ...)
 *
 * @var array
 */
private $_lastTermPositions;
/**
 * (Re)starts the terms stream of this segment and reads its first term.
 *
 * $startId - id for the first document
 * $compact - remove deleted documents
 *
 * Builds the document id map for all non-deleted documents: with $compact they are
 * numbered consecutively from $startId, otherwise they keep their segment id plus $startId.
 *
 * Returns start document id for the next segment
 *
 * @param integer $startId
 * @param boolean $compact
 * @throws Zend_Search_Lucene_Exception
 * @return integer
 */
public function reset($startId = 0, $compact = false)
{
    // Release a previously opened dictionary file before opening a private (non-shared) one
    $this->_tisFile = null;
    $this->_tisFile = $this->openCompoundFile('.tis', false);

    $formatVersion = $this->_tisFile->readInt();
    if ($formatVersion != (int)0xFFFFFFFE) {
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $this->_termCount = $this->_tisFile->readLong();
    $this->_tisFile->readInt();                           // Index interval (not needed here)
    $this->_skipInterval = $this->_tisFile->readInt();    // Skip interval

    $this->_frqFile = null;
    $this->_frqFile = $this->openCompoundFile('.frq', false);
    $this->_frqFileOffset = $this->_frqFile->tell();

    $this->_prxFile = null;
    $this->_prxFile = $this->openCompoundFile('.prx', false);
    $this->_prxFileOffset = $this->_prxFile->tell();

    $this->_lastTerm     = new Zend_Search_Lucene_Index_Term('', -1);
    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);

    $this->_docMap = array();
    $liveDocs = 0;
    for ($docId = 0; $docId < $this->_docCount; $docId++) {
        if ($this->isDeleted($docId)) {
            continue;
        }

        $this->_docMap[$docId] = $startId + ($compact ? $liveDocs : $docId);
        $liveDocs++;
    }

    $this->nextTerm();

    if ($compact) {
        return $startId + count($this->_docMap);
    }

    return $startId + $this->_docCount;
}
/**
 * Reads the next term of the terms stream together with its positions.
 *
 * The positions are stored for currentTermPositions(), keyed by the mapped document id;
 * documents that are missing from the document map are left out.
 * When the stream is exhausted (or was never opened) the stream state is cleared.
 *
 * @return Zend_Search_Lucene_Index_Term|null null when there are no more terms
 */
public function nextTerm()
{
    if ($this->_tisFile === null  ||  $this->_termCount == 0) {
        $this->_lastTerm     = null;
        $this->_lastTermInfo = null;

        // may be necessary for "empty" segment
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        return null;
    }

    // Term text is stored as: shared prefix length + suffix, relative to the previous term
    $prefixLength = $this->_tisFile->readVInt();
    $suffix       = $this->_tisFile->readString();
    $fieldNum     = $this->_tisFile->readVInt();
    $text         = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $prefixLength) . $suffix;

    $this->_lastTerm = new Zend_Search_Lucene_Index_Term($text, $this->_fields[$fieldNum]->name);

    // Pointers are stored as deltas from the previous term's pointers
    $docFreq     = $this->_tisFile->readVInt();
    $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
    $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
    $skipOffset  = 0;
    if ($docFreq >= $this->_skipInterval) {
        $skipOffset = $this->_tisFile->readVInt();
    }

    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    $current = $this->_lastTermInfo;


    $this->_lastTermPositions = array();

    // Per-document frequencies of the term
    $this->_frqFile->seek($current->freqPointer + $this->_frqFileOffset, SEEK_SET);
    $freqsByDoc = array();
    $docId      = 0;
    for ($read = 0; $read < $current->docFreq; $read++) {
        $docDelta = $this->_frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            $docId += ($docDelta-1)/2;
            $freqsByDoc[$docId] = 1;
        } else {
            $docId += $docDelta/2;
            $freqsByDoc[$docId] = $this->_frqFile->readVInt();
        }
    }

    // Positions of the term; they are read for every document, also for unmapped ones
    $this->_prxFile->seek($current->proxPointer + $this->_prxFileOffset, SEEK_SET);
    foreach ($freqsByDoc as $docId => $freq) {
        $position     = 0;
        $docPositions = array();

        for ($read = 0; $read < $freq; $read++) {
            $position += $this->_prxFile->readVInt();
            $docPositions[] = $position;
        }

        if (isset($this->_docMap[$docId])) {
            $this->_lastTermPositions[$this->_docMap[$docId]] = $docPositions;
        }
    }


    $this->_termCount--;
    if ($this->_termCount == 0) {
        // That was the last term: release the stream files
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;
    }

    return $this->_lastTerm;
}
/**
 * Returns the term the terms stream is currently positioned at
 * (null once the stream is exhausted).
 *
 * @return Zend_Search_Lucene_Index_Term|null
 */
public function currentTerm()
{
    $current = $this->_lastTerm;

    return $current;
}
/**
 * Returns all positions of the current term in the documents.
 * Return array structure: array( docId => array( pos1, pos2, ...), ...)
 *
 * @return array
 */
public function currentTermPositions()
{
    $positions = $this->_lastTermPositions;

    return $positions;
}