includes/Tokenizer.php

   1 <?php
   2 /**
   3  *
   4  * @package MediaWiki
   5  */
   6
   7 /**
   8  *
   9  * @package MediaWiki
  10  */
  11 class Tokenizer {
  12         /* private */ var $mText,               # Text to be processed by the tokenizer
  13                           $mPos,                # current position of tokenizer in text
  14                           $mTextLength,         # Length of $mText
  15                           $mQueuedToken;        # Tokens that were already found, but not
  16                                                 # returned yet.
  17
  18         /**
  19          * Constructor
  20          * @access private
  21          */
  22         function Tokenizer() {
  23                 global $wgLang;
  24
  25                 $this->mPos=0;
  26                 $this->mTokenQueue=array();
  27                 $this->linkPrefixExtension = $wgLang->linkPrefixExtension();
  28         }
  29
  30         /**
  31          * factory function
  32          */
  33         function newFromString( $s ) {
  34                 $fname = 'Tokenizer::newFromString';
  35                 wfProfileIn( $fname );
  36
  37                 $t = new Tokenizer();
  38                 $t->mText = $s;
  39                 $t->mTextLength = strlen( $s );
  40
  41                 wfProfileOut( $fname );
  42                 return $t;
  43         }
  44
  45
  46         /**
  47          * Return the next token, but do not increase the pointer. The next call
  48          * to previewToken or nextToken will return the same token again.
  49          * Actually, the pointer is increased, but the token is queued. The next
  50          * call to previewToken or nextToken will check the queue and return
  51          * the stored token.
  52          */
  53         function previewToken() {
  54                 $fname = 'Tokenizer::previewToken';
  55                 wfProfileIn( $fname );
  56
  57                 if ( count( $this->mQueuedToken ) != 0 ) {
  58                         // still one token from the last round around. Return that one first.
  59                         $token = $this->mQueuedToken[0];
  60                 } else {
  61                         $token = $this->nextToken();
  62                         array_unshift( $this->mQueuedToken, $token );
  63                 }
  64
  65                 wfProfileOut( $fname );
  66                 return $token;
  67         }
  68
  69
  70         /**
  71          * Get the next token.
  72          *
  73          * proceeds character by character through the text, looking for characters needing
  74          * special attention. Those are currently: I, R, ', [, ], newline
  75          *
  76          * @todo handling of French blanks not yet implemented
  77          */
  78         function nextToken() {
  79                 $fname = 'Tokenizer::nextToken';
  80                 wfProfileIn( $fname );
  81
  82                 if ( count( $this->mQueuedToken ) != 0 ) {
  83                         // still one token from the last round around. Return that one first.
  84                         $token = array_shift( $this->mQueuedToken );
  85                 } else if ( $this->mPos > $this->mTextLength ) {
  86                         // If no text is left, return 'false'.
  87                         $token = false;
  88                 } else {
  89
  90                         $token['text']='';
  91                         $token['type']='text';
  92
  93                         while ( $this->mPos <= $this->mTextLength ) {
  94                                 switch ( @$ch = $this->mText[$this->mPos] ) {
  95                                         case 'R': // for "RFC "
  96                                                 if ( $this->continues('FC ') ) {
  97                                                         $queueToken['type'] = $queueToken['text'] = 'RFC ';
  98                                                         $this->mQueuedToken[] = $queueToken;
  99                                                         $this->mPos += 3;
 100                                                         break 2; // switch + while
 101                                                 }
 102                                                 break;
 103                                         case 'I': // for "ISBN "
 104                                                 if ( $this->continues('SBN ') ) {
 105                                                         $queueToken['type'] = $queueToken['text'] = 'ISBN ';
 106                                                         $this->mQueuedToken[] = $queueToken;
 107                                                         $this->mPos += 4;
 108                                                         break 2; // switch + while
 109                                                 }
 110                                                 break;
 111                                         case '[': // for links "[["
 112                                                 if ( $this->continues('[[') ) {
 113                                                         $queueToken['type'] = '[[[';
 114                                                         $queueToken['text'] = '';
 115                                                         $this->mQueuedToken[] = $queueToken;
 116                                                         $this->mPos += 3;
 117                                                         break 2; // switch + while
 118                                                 } else if ( $this->continues('[') ) {
 119                                                         $queueToken['type'] = '[[';
 120                                                         $queueToken['text'] = '';
 121                                                         // Check for a "prefixed link", e.g. Al[[Khazar]]
 122                                                         // Mostly for arabic wikipedia
 123                                                         if ( $this->linkPrefixExtension ) {
 124                                                                 while (    $this->linkPrefixExtension
 125                                                                         && ($len = strlen( $token['text'] ) ) > 0
 126                                                                         && !ctype_space( $token['text'][$len-1] ) )
 127                                                                 {
 128                                                                         //prepend the character to the link's open tag
 129                                                                         $queueToken['text'] = $token['text'][$len-1] . $queueToken['text'];
 130                                                                         //remove character from the end of the text token
 131                                                                         $token['text'] = substr( $token['text'], 0, -1);
 132                                                                 }
 133                                                         }
 134                                                         $this->mQueuedToken[] = $queueToken;
 135                                                         $this->mPos += 2;
 136                                                         break 2; // switch + while
 137                                                 }
 138                                                 break;
 139                                         case ']': // for end of links "]]"
 140                                                 if ( $this->continues(']') ) {
 141                                                         $queueToken['type'] = ']]';
 142                                                         $queueToken['text'] = '';
 143                                                         $this->mQueuedToken[] = $queueToken;
 144                                                         $this->mPos += 2;
 145                                                         break 2; // switch + while
 146                                                 }
 147                                                 break;
 148                                         case "'": // for all kind of em's and strong's
 149                                                 if ( $this->continues("'") ) {
 150                                                         $queueToken['type'] = "'";
 151                                                         $queueToken['text'] = '';
 152                                                         while(   ($this->mPos+1 < $this->mTextLength)
 153                                                                && $this->mText[$this->mPos+1] == "'" )
 154                                                         {
 155                                                                 $queueToken['type'] .= "'";
 156                                                                 $queueToken['pos'] = $this->mPos;
 157                                                                 $this->mPos ++;
 158                                                         }
 159
 160                                                         $this->mQueuedToken[] = $queueToken;
 161                                                         $this->mPos ++;
 162                                                         break 2; // switch + while
 163                                                 }
 164                                                 break;
 165                                         case "\n": // for block levels, actually, only "----" is handled.
 166                                         case "\r": // headings are detected to close any unbalanced em or strong tags in a section
 167                                                 if ( $this->continues( '----' ) )
 168                                                 {
 169                                                         $queueToken['type'] = '----';
 170                                                         $queueToken['text'] = '';
 171                                                         $this->mQueuedToken[] = $queueToken;
 172                                                         $this->mPos += 5;
 173                                                         while (     $this->mPos<$this->mTextLength
 174                                                                 and $this->mText[$this->mPos] == '-' )
 175                                                         {
 176                                                                 $this->mPos ++;
 177                                                         }
 178                                                         break 2;
 179                                                 } else if (
 180                                                         $this->continues( '<h' ) and (
 181                                                                 $this->continues( '<h1' ) or
 182                                                                 $this->continues( '<h2' ) or
 183                                                                 $this->continues( '<h3' ) or
 184                                                                 $this->continues( '<h4' ) or
 185                                                                 $this->continues( '<h5' ) or
 186                                                                 $this->continues( '<h6' )
 187                                                         )
 188                                                 ) { // heading
 189                                                         $queueToken['type'] = 'h';
 190                                                         $queueToken['text'] = '';
 191                                                         $this->mQueuedToken[] = $queueToken;
 192                                                         $this->mPos ++;
 193                                                         break 2; // switch + while
 194                                                 }
 195                                                 break;
 196                                         case '!': // French spacing rules have a space before exclamation
 197                                         case '?': // and question marks. Those have to become &nbsp;
 198                                         case ':': // And colons, Hashar says ...
 199                                                 if ( $this->preceeded( ' ' ) )
 200                                                 {
 201                                                         // strip blank from Token
 202                                                         $token['text'] = substr( $token['text'], 0, -1 );
 203                                                         $queueToken['type'] = 'blank';
 204                                                         $queueToken['text'] = ' '.$ch;
 205                                                         $this->mQueuedToken[] = $queueToken;
 206                                                         $this->mPos ++;
 207                                                         break 2; // switch + while
 208                                                 }
 209                                                 break;
 210                                         case '0': // A space between two numbers is used to ease reading
 211                                         case '1': // of big numbers, e.g. 1 000 000. Those spaces need
 212                                         case '2': // to be unbreakable
 213                                         case '3':
 214                                         case '4':
 215                                         case '5':
 216                                         case '6':
 217                                         case '7':
 218                                         case '8':
 219                                         case '9':
 220                                                 if (    ($this->mTextLength >= $this->mPos +2)
 221                                                      && ($this->mText[$this->mPos+1] == ' ')
 222                                                      && ctype_digit( $this->mText[$this->mPos+2] ) )
 223                                                 {
 224                                                         $queueToken['type'] = 'blank';
 225                                                         $queueToken['text'] = $ch . ' ';
 226                                                         $this->mQueuedToken[] = $queueToken;
 227                                                         $this->mPos += 2;
 228                                                         break 2; // switch + while
 229                                                 }
 230                                                 break;
 231                                         case "\302": // first byte of UTF-8 Character Guillemet-left
 232                                                 if ( $this->continues( "\253 ") ) // second byte and a blank
 233                                                 {
 234                                                         $queueToken['type'] = 'blank';
 235                                                         $queueToken['text'] = "\302\253 ";
 236                                                         $this->mQueuedToken[] = $queueToken;
 237                                                         $this->mPos += 3;
 238                                                         break 2; // switch + while
 239                                                 }
 240                                                 break;
 241                                         case "\273": //last byte of UTF-8 Character Guillemet-right
 242                                                 if ( $this->preceeded( " \302" ) )
 243                                                 {
 244                                                         $queueToken['type'] = 'blank';
 245                                                         $queueToken['text'] = " \302\273";
 246                                                         $token['text'] = substr( $token['text'], 0, -2 );
 247                                                         $this->mQueuedToken[] = $queueToken;
 248                                                         $this->mPos ++;
 249                                                         break 2; // switch + while
 250                                                 }
 251                                                 break;
 252                                         case '&': //extensions like <timeline>, since HTML stripping has already been done,
 253                                                   //those look like &lt;timeline&gt;
 254                                                 if ( $this->continues( 'lt;timeline&gt;' ) )
 255                                                 {
 256                                                         $queueToken['type'] = '<timeline>';
 257                                                         $queueToken['text'] = '&lt;timeline&gt;';
 258                                                         $this->mQueuedToken[] = $queueToken;
 259                                                         $this->mPos += 16;
 260                                                         break 2; // switch + while
 261                                                 }
 262                                                 break;
 263
 264                                 } /* switch */
 265                                 $token['text'].=$ch;
 266                                 $this->mPos ++;
 267                                 // echo $this->mPos . "<br>\n";
 268                         } /* while */
 269                 } /* if (nothing left in queue) */
 270
 271                 wfProfileOut( $fname );
 272                 return $token;
 273         }
 274
 275         /**
 276          * function continues
 277          *
 278          * checks whether the mText continues with $cont from mPos+1
 279          *
 280          * @access private
 281          */
 282         function continues( $cont ) {
 283                 // If string is not long enough to contain $cont, return false
 284                 if ( $this->mTextLength < $this->mPos + strlen( $cont ) )
 285                         return false;
 286                 for ( $i=0; $i < strlen( $cont ); $i++ )
 287                 {
 288                         if ( $this->mText[$this->mPos+1+$i] != $cont[$i] )
 289                                 return false;
 290                 }
 291                 return true;
 292         }
 293
 294         /**
 295          * function preceeded
 296          *
 297          * checks whether the mText is preceeded by $prec at position mPos
 298          *
 299          * @access private
 300          */
 301         function preceeded( $prec ) {
 302                 $len = strlen( $prec );
 303                 // if $prec is longer than the text up to mPos, return false
 304                 if ( $this->mPos < $len )
 305                         return false;
 306                 return ( 0 == strcmp( $prec, substr($this->mText, $this->mPos-$len, $len) ) );
 307         }
 308
 309         /**
 310          *
 311          */
 312         function readAllUntil( $border ) {
 313                 $n = strpos( $this->mText, $border, $this->mPos );
 314                 if ( $n === false )
 315                         return '';
 316                 $ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
 317                 $this->mPos = $n + strlen( $border ) + 1;
 318                 return $ret;
 319         }
 320
 321 }