search/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php

   1 <?php
   2 /**
   3  * Zend Framework
   4  *
   5  * LICENSE
   6  *
   7  * This source file is subject to the new BSD license that is bundled
   8  * with this package in the file LICENSE.txt.
   9  * It is also available through the world-wide-web at this URL:
  10  * http://framework.zend.com/license/new-bsd
  11  * If you did not receive a copy of the license and are unable to
  12  * obtain it through the world-wide-web, please send an email
  13  * to license@zend.com so we can send you a copy immediately.
  14  *
  15  * @category   Zend
  16  * @package    Zend_Search_Lucene
  17  * @subpackage Analysis
  18  * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  20  */
  21
  22
  23 /** Zend_Search_Lucene_Analysis_Analyzer_Common */
  24 require_once $CFG->dirroot.'/search/Zend/Search/Lucene/Analysis/Analyzer/Common.php';
  25
  26
  27 /**
  28  * @category   Zend
  29  * @package    Zend_Search_Lucene
  30  * @subpackage Analysis
  31  * @copyright  Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
  32  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  33  */
  34
  35 class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num extends Zend_Search_Lucene_Analysis_Analyzer_Common
  36 {
  37     /**
  38      * Current char position in an UTF-8 stream
  39      *
  40      * @var integer
  41      */
  42     private $_position;
  43
  44     /**
  45      * Current binary position in an UTF-8 stream
  46      *
  47      * @var integer
  48      */
  49     private $_bytePosition;
  50
  51     /**
  52      * Stream length
  53      *
  54      * @var integer
  55      */
  56     private $_streamLength;
  57
  58     /**
  59      * Reset token stream
  60      */
  61     public function reset()
  62     {
  63         $this->_position     = 0;
  64         $this->_bytePosition = 0;
  65
  66         // convert input into UTF-8
  67         if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
  68             strcasecmp($this->_encoding, 'utf-8') != 0 ) {
  69                 $this->_input = iconv($this->_encoding, 'UTF-8', $this->_input);
  70                 $this->_encoding = 'UTF-8';
  71         }
  72
  73         // Get UTF-8 string length.
  74         // It also checks if it's a correct utf-8 string
  75         $this->_streamLength = iconv_strlen($this->_input, 'UTF-8');
  76     }
  77
  78     /**
  79      * Check, that character is a letter
  80      *
  81      * @param string $char
  82      * @return boolean
  83      */
  84     private static function _isAlNum($char)
  85     {
  86         if (strlen($char) > 1) {
  87             // It's an UTF-8 character
  88             return true;
  89         }
  90
  91         return ctype_alnum($char);
  92     }
  93
  94     /**
  95      * Get next UTF-8 char
  96      *
  97      * @param string $char
  98      * @return boolean
  99      */
 100     private function _nextChar()
 101     {
 102         $char = $this->_input[$this->_bytePosition++];
 103
 104         if (( ord($char) & 0xC0 ) == 0xC0) {
 105             $addBytes = 1;
 106             if (ord($char) & 0x20 ) {
 107                 $addBytes++;
 108                 if (ord($char) & 0x10 ) {
 109                     $addBytes++;
 110                 }
 111             }
 112             $char .= substr($this->_input, $this->_bytePosition, $addBytes);
 113             $this->_bytePosition += $addBytes;
 114         }
 115
 116         $this->_position++;
 117
 118         return $char;
 119     }
 120
 121     /**
 122      * Tokenization stream API
 123      * Get next token
 124      * Returns null at the end of stream
 125      *
 126      * @return Zend_Search_Lucene_Analysis_Token|null
 127      */
 128     public function nextToken()
 129     {
 130         if ($this->_input === null) {
 131             return null;
 132         }
 133
 134         while ($this->_position < $this->_streamLength) {
 135             // skip white space
 136             while ($this->_position < $this->_streamLength &&
 137                    !self::_isAlNum($char = $this->_nextChar())) {
 138                 $char = '';
 139             }
 140
 141             $termStartPosition = $this->_position - 1;
 142             $termText = $char;
 143
 144             // read token
 145             while ($this->_position < $this->_streamLength &&
 146                    self::_isAlNum($char = $this->_nextChar())) {
 147                 $termText .= $char;
 148             }
 149
 150             // Empty token, end of stream.
 151             if ($termText == '') {
 152                 return null;
 153             }
 154
 155             $token = new Zend_Search_Lucene_Analysis_Token(
 156                                       $termText,
 157                                       $termStartPosition,
 158                                       $this->_position - 1);
 159             $token = $this->normalize($token);
 160             if ($token !== null) {
 161                 return $token;
 162             }
 163             // Continue if token is skipped
 164         }
 165
 166         return null;
 167     }
 168 }
 169