adding some strings
[moodle-linuxchix.git] / search / Zend / Search / Lucene / Analysis / Analyzer / Common / Utf8.php
blob674a3d9e643871a6b263b3e367897dde21219f1c
1 <?php
2 /**
3 * Zend Framework
5 * LICENSE
7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
15 * @category Zend
16 * @package Zend_Search_Lucene
17 * @subpackage Analysis
18 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
23 /** Zend_Search_Lucene_Analysis_Analyzer_Common */
24 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
27 /**
28 * @category Zend
29 * @package Zend_Search_Lucene
30 * @subpackage Analysis
31 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
32 * @license http://framework.zend.com/license/new-bsd New BSD License
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Current character position in the UTF-8 stream
     * (counted in characters, not bytes)
     *
     * @var integer
     */
    private $_position;

    /**
     * Current byte position in the UTF-8 stream
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Stream length in characters
     *
     * @var integer
     */
    private $_streamLength;

    /**
     * Reset token stream
     *
     * Rewinds both cursors, converts the input to UTF-8 when the declared
     * encoding is anything other than 'utf8'/'utf-8' (case-insensitive) and
     * measures the stream length in characters.
     */
    public function reset()
    {
        $this->_position     = 0;
        $this->_bytePosition = 0;

        // convert input into UTF-8
        if (strcasecmp($this->_encoding, 'utf8' ) !== 0 &&
            strcasecmp($this->_encoding, 'utf-8') !== 0 ) {
            $converted = iconv($this->_encoding, 'UTF-8', $this->_input);

            // iconv() returns false on failure; keep the input a string so the
            // byte-indexing in _nextChar() never operates on a boolean.
            $this->_input    = ($converted === false) ? '' : $converted;
            $this->_encoding = 'UTF-8';
        }

        // Get UTF-8 string length.
        // iconv_strlen() also validates the string and returns false when it
        // is not well-formed UTF-8; in that case the stream is treated as
        // empty so that $_streamLength always holds an integer.
        $length = iconv_strlen($this->_input, 'UTF-8');
        $this->_streamLength = ($length === false) ? 0 : $length;
    }

    /**
     * Check, that character is a letter
     *
     * Single-byte characters are classified with ctype_alpha().
     * Multi-byte characters are accepted only when they are Unicode letters
     * (\p{L}) or combining marks (\p{M}); marks are included so that words
     * written with combining signs are not split. Multi-byte punctuation and
     * symbols are therefore no longer treated as part of a word.
     *
     * @param string $char  one UTF-8 encoded character
     * @return boolean
     */
    private static function _isAlpha($char)
    {
        if (strlen($char) > 1) {
            // Multi-byte UTF-8 character.
            // preg_match() returns false for a malformed sequence, which is
            // reported here as "not a letter".
            return preg_match('/^[\p{L}\p{M}]$/u', $char) === 1;
        }

        return ctype_alpha($char);
    }

    /**
     * Get next UTF-8 char
     *
     * Reads the byte at the current byte position and, when it is a
     * multi-byte lead byte, the continuation bytes announced by it.
     * Advances the byte cursor by the number of bytes read and the
     * character cursor by one.
     *
     * @return string  the character that was read
     */
    private function _nextChar()
    {
        $char = $this->_input[$this->_bytePosition++];
        $lead = ord($char);

        if (($lead & 0xC0) == 0xC0) {
            // 11xxxxxx: lead byte of a multi-byte sequence.
            // 110xxxxx => 1 extra byte, 1110xxxx => 2, 1111xxxx => 3.
            $addBytes = 1;
            if ($lead & 0x20) {
                $addBytes++;
                if ($lead & 0x10) {
                    $addBytes++;
                }
            }

            $char .= substr($this->_input, $this->_bytePosition, $addBytes);
            $this->_bytePosition += $addBytes;
        }

        $this->_position++;

        return $char;
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * Token offsets are character offsets; the end offset is the index just
     * past the last character of the token.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while ($this->_position < $this->_streamLength) {
            // skip everything that is not a letter
            $char = '';
            while ($this->_position < $this->_streamLength) {
                $char = $this->_nextChar();
                if (self::_isAlpha($char)) {
                    break;
                }
                $char = '';
            }

            // Empty token, end of stream.
            if ($char === '') {
                return null;
            }

            $termStartPosition = $this->_position - 1;
            $termEndPosition   = $this->_position;
            $termText          = $char;

            // read token
            while ($this->_position < $this->_streamLength) {
                $char = $this->_nextChar();
                if (!self::_isAlpha($char)) {
                    // The delimiter has been consumed; it is not part of the token.
                    break;
                }

                $termText       .= $char;
                // Track the end explicitly: deriving it from $_position - 1
                // is only right when a delimiter was consumed and comes out
                // one short when the token runs to the end of the stream.
                $termEndPosition = $this->_position;
            }

            $token = new Zend_Search_Lucene_Analysis_Token(
                                      $termText,
                                      $termStartPosition,
                                      $termEndPosition);
            $token = $this->normalize($token);
            if ($token !== null) {
                return $token;
            }
            // Continue if token is skipped
        }

        return null;
    }
}