Merge commit 'catalyst/MOODLE_19_STABLE' into mdl19-linuxchix
[moodle-linuxchix.git] / search / Zend / Search / Lucene / Index / DictionaryLoader.php
blobd3f0669c814fb428980a6ea404ac5a9ba219b849
1 <?php
2 /**
3 * Zend Framework
5 * LICENSE
7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
15 * @category Zend
16 * @package Zend_Search_Lucene
17 * @subpackage Index
18 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
23 /** Zend_Search_Lucene_Exception */
24 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Exception.php';
27 /**
28 * Dictionary loader
30 * It's a dummy class created to encapsulate poorly structured code.
31 * Manual "method inlining" is performed to speed up the dictionary index loading operation,
32 * which is a major bottleneck for search performance.
35 * @category Zend
36 * @package Zend_Search_Lucene
37 * @subpackage Index
38 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
39 * @license http://framework.zend.com/license/new-bsd New BSD License
class Zend_Search_Lucene_Index_DictionaryLoader
{
    /**
     * Dictionary index loader.
     *
     * It takes a string which is actually <segment_name>.tii index file data and
     * returns two arrays - term and termInfo lists.
     *
     * The first returned list holds one array($termFieldNum, $termValue) per index term.
     * The second holds one array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer)
     * per index term, in the same order.
     *
     * All integer decoding is inlined on purpose (no helper calls) because this loop
     * is performance critical.
     *
     * See Zend_Search_Lucene_Index_SegmentInfo class for details
     *
     * @param string $data  Raw contents of the .tii file
     * @return array        array(&$termDictionary, &$termInfos)
     * @throws Zend_Search_Lucene_Exception  If the header or the first (special) entry is malformed,
     *                                       or the term count does not fit a 32-bit integer on 32-bit PHP
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $pos            = 0;

        // The fixed-size header is 20 bytes: version (4) + term count (8) +
        // index interval (4) + skip interval (4). Reject anything shorter up front
        // instead of reading undefined string offsets.
        if (strlen($data) < 20) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $tiVersion = $tiiFile->readInt();
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $indexTermCount = $tiiFile->readLong();
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos])   << 56 |
                              ord($data[$pos+1]) << 48 |
                              ord($data[$pos+2]) << 40 |
                              ord($data[$pos+3]) << 32 |
                              ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        } else {
            // On 32-bit PHP only values that fit a signed 32-bit integer are usable:
            // the high four bytes and the top bit of the fifth byte must be zero.
            if ((ord($data[$pos])   != 0) ||
                (ord($data[$pos+1]) != 0) ||
                (ord($data[$pos+2]) != 0) ||
                (ord($data[$pos+3]) != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
            }

            $indexTermCount = ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        // $tiiFile->readInt(); // IndexInterval (value is not used here)
        $pos += 4;

        // $skipInterval = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
        $pos += 4;

        if ($indexTermCount < 1) {
            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
        }

        // Terms are prefix-compressed against the previous term and the three
        // pointers are stored as deltas, so running values are kept across iterations.
        $prevTerm     = '';
        $freqPointer  = 0;
        $proxPointer  = 0;
        $indexPointer = 0;

        for ($count = 0; $count < $indexTermCount; $count++) {
            // $termPrefixLength = $tiiFile->readVInt();
            // VInt: 7 payload bits per byte, least significant group first,
            // high bit set means "more bytes follow".
            $nbyte = ord($data[$pos++]);
            $termPrefixLength = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
            }

            // $termSuffix = $tiiFile->readString();
            $nbyte = ord($data[$pos++]);
            $len = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $len |= ($nbyte & 0x7F) << $shift;
            }

            if ($len == 0) {
                $termSuffix = '';
            } else {
                // $len is a character count. Read that many bytes first, then pull in
                // the continuation bytes of every multi-byte character found while
                // scanning. From here on $len tracks the byte length of $termSuffix.
                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++) {
                    if ((ord($termSuffix[$count1]) & 0xC0) == 0xC0) {
                        $addBytes = 1;
                        if (ord($termSuffix[$count1]) & 0x20) {
                            $addBytes++;
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes.
                        if (ord($termSuffix[$count1]) == 0xC0 &&
                            ord($termSuffix[$count1+1]) == 0x80) {
                            // Collapse the two-byte sequence into one real NUL byte.
                            // The character now occupies a single byte, so the byte
                            // length shrinks by one and the scan must not skip ahead.
                            $termSuffix = substr($termSuffix, 0, $count1)
                                        . "\0"
                                        . substr($termSuffix, $count1 + 2);
                            $len--;
                        } else {
                            // Step over the continuation bytes of this character.
                            $count1 += $addBytes;
                        }
                    }
                }
            }

            // $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
            // Convert the shared prefix length from characters ($pc) to bytes ($pb).
            $prevTermBytes = strlen($prevTerm);
            $pb = 0;
            $pc = 0;
            while ($pb < $prevTermBytes && $pc < $termPrefixLength) {
                $charBytes = 1;
                if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($prevTerm[$pb]) & 0x20) {
                        $charBytes++;
                        if (ord($prevTerm[$pb]) & 0x10) {
                            $charBytes++;
                        }
                    }
                }

                if ($pb + $charBytes > $prevTermBytes) {
                    // wrong character: the previous term ends in a truncated sequence
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            // $termFieldNum = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termFieldNum = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termFieldNum |= ($nbyte & 0x7F) << $shift;
            }

            // $docFreq = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $docFreq = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $docFreq |= ($nbyte & 0x7F) << $shift;
            }

            // $freqPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $freqPointer += $vint;

            // $proxPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $proxPointer += $vint;

            // The skip delta is only present in the stream when docFreq reaches skipInterval.
            if ($docFreq >= $skipInterval) {
                // $skipDelta = $tiiFile->readVInt();
                $nbyte = ord($data[$pos++]);
                $vint = $nbyte & 0x7F;
                for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                    $nbyte = ord($data[$pos++]);
                    $vint |= ($nbyte & 0x7F) << $shift;
                }
                $skipDelta = $vint;
            } else {
                $skipDelta = 0;
            }

            // $indexPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $indexPointer += $vint;

            // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
            $termDictionary[] = array($termFieldNum, $termValue);

            $termInfos[] =
                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm = $termValue;
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        } else if (PHP_INT_SIZE > 4) {
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array(&$termDictionary, &$termInfos);
    }
}