7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
16 * @package Zend_Search_Lucene
17 * @subpackage Analysis
18 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
23 /** Zend_Search_Lucene_Analysis_Analyzer_Common */
24 require_once $CFG->dirroot
.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
29 * @package Zend_Search_Lucene
30 * @subpackage Analysis
31 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
32 * @license http://framework.zend.com/license/new-bsd New BSD License
35 class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8
extends Zend_Search_Lucene_Analysis_Analyzer_Common
38 * Current character position in a UTF-8 stream
45 * Current byte position in a UTF-8 stream
// Byte cursor into the input string: reset() zeroes it and _nextChar() uses it
// as the string offset and advances it.
49 private $_bytePosition;
// Length of the input as returned by iconv_strlen($this->_input, 'UTF-8') in
// reset(), i.e. a count of characters rather than bytes.
56 private $_streamLength;
// Resets the tokenizer state for the current input: zeroes the byte cursor,
// transcodes the input to UTF-8 when its declared encoding is something else,
// and records the stream length.
// NOTE(review): this extract carries the original file's line numbers as a
// leading token and omits some lines (62-63, 65 and 71-72 are absent here), so
// the complete method body is not visible - confirm against the real file.
61 public function reset()
64 $this->_bytePosition
= 0;
66 // Transcode the input to UTF-8 unless the declared encoding is already
// 'utf8' or 'utf-8' (strcasecmp() makes the comparison case-insensitive).
// NOTE(review): iconv() returns false on failure; no check of its result is
// visible in this extract.
67 if (strcasecmp($this->_encoding
, 'utf8' ) != 0 &&
68 strcasecmp($this->_encoding
, 'utf-8') != 0 ) {
69 $this->_input
= iconv($this->_encoding
, 'UTF-8', $this->_input
);
70 $this->_encoding
= 'UTF-8';
73 // Get the length of the UTF-8 string in characters.
74 // iconv_strlen() also fails (returns false) when the input is not valid UTF-8.
// NOTE(review): the result is stored as-is; no check for false is visible here.
75 $this->_streamLength
= iconv_strlen($this->_input
, 'UTF-8');
79 * Check whether a character is a letter
// Decides whether $char counts as a letter. nextToken() passes it the value
// returned by _nextChar(), which may be several bytes long.
//
// @param string $char character to classify
84 private static function _isAlpha($char)
// More than one byte means the character is a multi-byte UTF-8 sequence.
// NOTE(review): the body of this branch (original lines 88-90) is missing from
// the extract; presumably such characters are accepted as letters - confirm.
86 if (strlen($char) > 1) {
87 // It's a multi-byte UTF-8 character
// The visible return classifies the character with ctype_alpha().
91 return ctype_alpha($char);
// Reads the next character from the UTF-8 input starting at the byte cursor
// and advances the cursor past it.
// NOTE(review): the lines that assign $addBytes (original 105, 107, 109) and
// the end of the method (114 onwards) are missing from this extract; the
// return value and any update of the character position are not visible here.
100 private function _nextChar()
// Fetch the byte at the cursor, then move the cursor one byte forward.
102 $char = $this->_input
[$this->_bytePosition++
];
// Top two bits set (11xxxxxx): lead byte of a multi-byte UTF-8 sequence.
104 if (( ord($char) & 0xC0 ) == 0xC0) {
// Bit 0x20 also set (111xxxxx): the sequence has at least three bytes.
106 if (ord($char) & 0x20 ) {
// Bit 0x10 also set (1111xxxx): the sequence has four bytes.
108 if (ord($char) & 0x10 ) {
// Append the $addBytes bytes that follow the lead byte and skip the cursor
// over them.
112 $char .= substr($this->_input
, $this->_bytePosition
, $addBytes);
113 $this->_bytePosition +
= $addBytes;
122 * Tokenization stream API
124 * Returns null at the end of stream
126 * @return Zend_Search_Lucene_Analysis_Token|null
// Tokenization stream API: scans forward from the current position, skips
// characters that _isAlpha() rejects, collects a run of characters it accepts,
// wraps the run in a Zend_Search_Lucene_Analysis_Token and passes it through
// normalize().
// NOTE(review): this extract omits many original lines (e.g. 131-133, 138-140,
// 142-144, 147-149, 152-154, 156-157, 161-162) and may end before the method
// does; the return statements, the handling of $termText and the first
// constructor arguments are not visible - confirm against the real file.
128 public function nextToken()
// No input has been set.
130 if ($this->_input
=== null) {
// $_streamLength is a character count (see reset()), so $_position is compared
// as a character offset here, not a byte offset.
134 while ($this->_position
< $this->_streamLength
) {
// Skip phase: keep consuming characters while they are not letters.
136 while ($this->_position
< $this->_streamLength
&&
137 !self
::_isAlpha($char = $this->_nextChar())) {
// The term starts at the character consumed last, one before the current
// position.
// NOTE(review): presumably _nextChar() advances $_position in a line that is
// not shown in this extract.
141 $termStartPosition = $this->_position
- 1;
// Read phase: keep consuming characters while they are letters.
145 while ($this->_position
< $this->_streamLength
&&
146 self
::_isAlpha($char = $this->_nextChar())) {
150 // Empty token, end of stream.
151 if ($termText == '') {
// Build the token; only the last constructor argument (current position minus
// one) is visible in this extract.
155 $token = new Zend_Search_Lucene_Analysis_Token(
158 $this->_position
- 1);
// normalize() may return null, which the check below treats as "token skipped".
159 $token = $this->normalize($token);
160 if ($token !== null) {
163 // Continue with the next iteration if the token was skipped