*prechod na novsiu verziu ZF
[sport-group.git] / library / Zend / Search / Lucene / Index / DictionaryLoader.php
blobbc1a41cad26d68e7305043639f41313367d7a72b
1 <?php
2 /**
3 * Zend Framework
5 * LICENSE
7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
15 * @category Zend
16 * @package Zend_Search_Lucene
17 * @subpackage Index
18 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
20 * @version $Id: DictionaryLoader.php 16971 2009-07-22 18:05:45Z mikaelkael $
/**
 * Dictionary loader
 *
 * A helper class which exists only to encapsulate code that is intentionally
 * not well structured: "method inlining" is done by hand to speed up the
 * dictionary index loading operation, which is a major bottleneck for search
 * performance.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_DictionaryLoader
{
    /**
     * Dictionary index loader.
     *
     * It takes a string which is actually <segment_name>.tii index file data and
     * returns two parallel lists:
     *   - the term list, each entry being array($termFieldNum, $termValue);
     *   - the termInfo list, each entry being
     *     array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer).
     *
     * The field number of the first term (the special index entry mark) is
     * normalised to -1 on 64-bit platforms.
     *
     * See Zend_Search_Lucene_Index_SegmentInfo class for details
     *
     * @param string $data  Raw contents of the .tii file
     * @return array        array($termDictionary, $termInfos)
     * @throws Zend_Search_Lucene_Exception  If the data has an unknown version,
     *                                       a non-positive term count, is truncated,
     *                                       lacks the special first entry, or (32-bit
     *                                       mode only) declares a term count that does
     *                                       not fit into a signed 32-bit integer.
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $pos            = 0;
        $dataLength     = strlen($data);

        // Fixed-size header: version (4) + term count (8) + index interval (4)
        // + skip interval (4). Anything shorter cannot be a valid index file.
        if ($dataLength < 20) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $tiVersion = $tiiFile->readInt();
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $indexTermCount = $tiiFile->readLong();
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos])   << 56 |
                              ord($data[$pos+1]) << 48 |
                              ord($data[$pos+2]) << 40 |
                              ord($data[$pos+3]) << 32 |
                              ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        } else {
            // Only the low 31 bits can be represented in 32-bit mode.
            if ((ord($data[$pos])            != 0) ||
                (ord($data[$pos+1])          != 0) ||
                (ord($data[$pos+2])          != 0) ||
                (ord($data[$pos+3])          != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
            }

            $indexTermCount = ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        // $tiiFile->readInt(); // IndexInterval
        $pos += 4;

        // $skipInterval = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
        $pos += 4;
        if ($indexTermCount < 1) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
        }

        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            /* Skip MaxSkipLevels value */
            $pos += 4;
        }

        $prevTerm     = '';
        $freqPointer  = 0;
        $proxPointer  = 0;
        $indexPointer = 0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            //$termPrefixLength = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termPrefixLength = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
            }

            // $termSuffix = $tiiFile->readString();
            // The stored length is a character count; $len is grown below into
            // the byte length as multi-byte characters are discovered.
            $nbyte = ord($data[$pos++]);
            $len = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $len |= ($nbyte & 0x7F) << $shift;
            }
            if ($len == 0) {
                $termSuffix = '';
            } else {
                // Every character takes at least one byte, so a suffix that
                // does not fit into the remaining data means a truncated file.
                if ($len < 0 || $pos + $len > $dataLength) {
                    require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
                }

                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++ ) {
                    $leadByte = ord($termSuffix[$count1]);
                    if (($leadByte & 0xC0) == 0xC0) {
                        $addBytes = 1;
                        if ($leadByte & 0x20) {
                            $addBytes++;

                            // Never used for Java Lucene created index.
                            // Java2 doesn't encode strings in four bytes
                            if ($leadByte & 0x10) {
                                $addBytes++;
                            }
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes (0xC0 0x80).
                        if ($leadByte == 0xC0 &&
                            isset($termSuffix[$count1+1]) &&
                            ord($termSuffix[$count1+1]) == 0x80) {
                            // Collapse the pair into a real NUL byte. An integer 0
                            // assigned to a string offset would be cast to the
                            // digit character "0", so the string is rebuilt instead.
                            $termSuffix = substr($termSuffix, 0, $count1)
                                        . "\0"
                                        . substr($termSuffix, $count1+2);
                            // The character now occupies one byte: shrink the byte
                            // length and do not skip the byte that follows it.
                            // NOTE(review): any other reader of this string encoding
                            // elsewhere in the library (not visible here) should
                            // decode NUL the same way - confirm.
                            $len--;
                        } else {
                            $count1 += $addBytes;
                        }
                    }
                }
            }

            // $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
            // Walk over the first $termPrefixLength characters of the previous
            // term to find the matching byte offset.
            $pb = 0; $pc = 0;
            $prevTermLength = strlen($prevTerm);
            while ($pb < $prevTermLength  &&  $pc < $termPrefixLength) {
                $charBytes = 1;
                $leadByte  = ord($prevTerm[$pb]);
                if (($leadByte & 0xC0) == 0xC0) {
                    $charBytes++;
                    if ($leadByte & 0x20) {
                        $charBytes++;
                        if ($leadByte & 0x10) {
                            $charBytes++;
                        }
                    }
                }

                // The bound is the previous term itself, not the whole file data.
                if ($pb + $charBytes > $prevTermLength) {
                    // wrong character
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            // $termFieldNum = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termFieldNum = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termFieldNum |= ($nbyte & 0x7F) << $shift;
            }

            // $docFreq = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $docFreq = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $docFreq |= ($nbyte & 0x7F) << $shift;
            }

            // $freqPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $freqPointer += $vint;

            // $proxPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $proxPointer += $vint;

            // The skip delta is only stored when the term is frequent enough.
            if( $docFreq >= $skipInterval ) {
                // $skipDelta = $tiiFile->readVInt();
                $nbyte = ord($data[$pos++]);
                $vint = $nbyte & 0x7F;
                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                    $nbyte = ord($data[$pos++]);
                    $vint |= ($nbyte & 0x7F) << $shift;
                }
                $skipDelta = $vint;
            } else {
                $skipDelta = 0;
            }

            // $indexPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $indexPointer += $vint;

            // Reading past the end of the data means the entry was truncated;
            // stop instead of producing entries built from missing bytes.
            if ($pos > $dataLength) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
            }

            // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
            $termDictionary[] = array($termFieldNum, $termValue);

            $termInfos[] =
                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm = $termValue;
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        if (PHP_INT_SIZE > 4) {
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array($termDictionary, $termInfos);
    }
}