inc/parser/lexer.php

   1 <?php
   2 /**
    3 * Author Marcus Baker: http://www.lastcraft.com
   4 * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
   5 * For an intro to the Lexer see:
   6 * http://www.phppatterns.com/index.php/article/articleview/106/1/2/
   7 * @author Marcus Baker
   8 * @package Doku
   9 * @subpackage Lexer
  10 * @version $Id: lexer.php,v 1.1 2005/03/23 23:14:09 harryf Exp $
  11 */
  12
  13 /**
  14 * Init path constant
  15 */
  16 if(!defined('DOKU_INC')) die('meh.');
  17
  18 /**#@+
  19  * lexer mode constant
  20  */
  21 define("DOKU_LEXER_ENTER", 1);
  22 define("DOKU_LEXER_MATCHED", 2);
  23 define("DOKU_LEXER_UNMATCHED", 3);
  24 define("DOKU_LEXER_EXIT", 4);
  25 define("DOKU_LEXER_SPECIAL", 5);
  26 /**#@-*/
  27
  28 /**
  29  *    Compounded regular expression. Any of
  30  *    the contained patterns could match and
  31  *    when one does it's label is returned.
  32  *    @package Doku
  33  *    @subpackage Lexer
  34  */
  35 class Doku_LexerParallelRegex {
  36     var $_patterns;
  37     var $_labels;
  38     var $_regex;
  39     var $_case;
  40
  41     /**
  42      *    Constructor. Starts with no patterns.
  43      *    @param boolean $case    True for case sensitive, false
  44      *                            for insensitive.
  45      *    @access public
  46      */
  47     function Doku_LexerParallelRegex($case) {
  48         $this->_case = $case;
  49         $this->_patterns = array();
  50         $this->_labels = array();
  51         $this->_regex = null;
  52     }
  53
  54     /**
  55      *    Adds a pattern with an optional label.
  56      *    @param mixed $pattern       Perl style regex. Must be UTF-8
  57      *                                encoded. If its a string, the (, )
  58      *                                lose their meaning unless they
  59      *                                form part of a lookahead or
  60      *                                lookbehind assertation.
  61      *    @param string $label        Label of regex to be returned
  62      *                                on a match. Label must be ASCII
  63      *    @access public
  64      */
  65     function addPattern($pattern, $label = true) {
  66         $count = count($this->_patterns);
  67         $this->_patterns[$count] = $pattern;
  68         $this->_labels[$count] = $label;
  69         $this->_regex = null;
  70     }
  71
  72     /**
  73      *    Attempts to match all patterns at once against
  74      *    a string.
  75      *    @param string $subject      String to match against.
  76      *    @param string $match        First matched portion of
  77      *                                subject.
  78      *    @return boolean             True on success.
  79      *    @access public
  80      */
  81     function match($subject, &$match) {
  82         if (count($this->_patterns) == 0) {
  83             return false;
  84         }
  85         if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
  86             $match = "";
  87             return false;
  88         }
  89
  90         $match = $matches[0];
  91         $size = count($matches);
  92         for ($i = 1; $i < $size; $i++) {
  93             if ($matches[$i] && isset($this->_labels[$i - 1])) {
  94                 return $this->_labels[$i - 1];
  95             }
  96         }
  97         return true;
  98     }
  99
 100     /**
 101      *    Attempts to split the string against all patterns at once
 102      *
 103      *    @param string $subject      String to match against.
 104      *    @param array $split         The split result: array containing, pre-match, match & post-match strings
 105      *    @return boolean             True on success.
 106      *    @access public
 107      *
 108      *    @author Christopher Smith <chris@jalakai.co.uk>
 109      */
 110     function split($subject, &$split) {
 111         if (count($this->_patterns) == 0) {
 112             return false;
 113         }
 114
 115         if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
 116             if(function_exists('preg_last_error')){
 117                 $err = preg_last_error();
 118                 switch($err){
 119                     case PREG_BACKTRACK_LIMIT_ERROR:
 120                         msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',-1);
 121                         break;
 122                     case PREG_RECURSION_LIMIT_ERROR:
 123                         msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',-1);
 124                         break;
 125                     case PREG_BAD_UTF8_ERROR:
 126                         msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin',-1);
 127                         break;
 128                     case PREG_INTERNAL_ERROR:
 129                         msg('A PCRE internal error occured. This might be caused by a faulty plugin',-1);
 130                         break;
 131                 }
 132             }
 133
 134             $split = array($subject, "", "");
 135             return false;
 136         }
 137
 138         $idx = count($matches)-2;
 139         list($pre, $post) = preg_split($this->_patterns[$idx].$this->_getPerlMatchingFlags(), $subject, 2);
 140         $split = array($pre, $matches[0], $post);
 141
 142         return isset($this->_labels[$idx]) ? $this->_labels[$idx] : true;
 143     }
 144
 145     /**
 146      *    Compounds the patterns into a single
 147      *    regular expression separated with the
 148      *    "or" operator. Caches the regex.
 149      *    Will automatically escape (, ) and / tokens.
 150      *    @param array $patterns    List of patterns in order.
 151      *    @access private
 152      */
 153     function _getCompoundedRegex() {
 154         if ($this->_regex == null) {
 155             $cnt = count($this->_patterns);
 156             for ($i = 0; $i < $cnt; $i++) {
 157
 158                 /*
 159                  * decompose the input pattern into "(", "(?", ")",
 160                  * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
 161                  * elements.
 162                  */
 163                 preg_match_all('/\\\\.|' .
 164                                '\(\?|' .
 165                                '[()]|' .
 166                                '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
 167                                '[^[()\\\\]+/', $this->_patterns[$i], $elts);
 168
 169                 $pattern = "";
 170                 $level = 0;
 171
 172                 foreach ($elts[0] as $elt) {
 173                     /*
 174                      * for "(", ")" remember the nesting level, add "\"
 175                      * only to the non-"(?" ones.
 176                      */
 177
 178                     switch($elt) {
 179                     case '(':
 180                         $pattern .= '\(';
 181                         break;
 182                     case ')':
 183                         if ($level > 0)
 184                             $level--; /* closing (? */
 185                         else
 186                             $pattern .= '\\';
 187                         $pattern .= ')';
 188                         break;
 189                     case '(?':
 190                         $level++;
 191                         $pattern .= '(?';
 192                         break;
 193                     default:
 194                         if (substr($elt, 0, 1) == '\\')
 195                             $pattern .= $elt;
 196                         else
 197                             $pattern .= str_replace('/', '\/', $elt);
 198                     }
 199                 }
 200                 $this->_patterns[$i] = "($pattern)";
 201             }
 202             $this->_regex = "/" . implode("|", $this->_patterns) . "/" . $this->_getPerlMatchingFlags();
 203         }
 204         return $this->_regex;
 205     }
 206
 207     /**
 208      *    Accessor for perl regex mode flags to use.
 209      *    @return string       Perl regex flags.
 210      *    @access private
 211      */
 212     function _getPerlMatchingFlags() {
 213         return ($this->_case ? "msS" : "msSi");
 214     }
 215 }
 216
 217 /**
 218  *    States for a stack machine.
 219  *    @package Lexer
 220  *    @subpackage Lexer
 221  */
 222 class Doku_LexerStateStack {
 223     var $_stack;
 224
 225     /**
 226      *    Constructor. Starts in named state.
 227      *    @param string $start        Starting state name.
 228      *    @access public
 229      */
 230     function Doku_LexerStateStack($start) {
 231         $this->_stack = array($start);
 232     }
 233
 234     /**
 235      *    Accessor for current state.
 236      *    @return string       State.
 237      *    @access public
 238      */
 239     function getCurrent() {
 240         return $this->_stack[count($this->_stack) - 1];
 241     }
 242
 243     /**
 244      *    Adds a state to the stack and sets it
 245      *    to be the current state.
 246      *    @param string $state        New state.
 247      *    @access public
 248      */
 249     function enter($state) {
 250         array_push($this->_stack, $state);
 251     }
 252
 253     /**
 254      *    Leaves the current state and reverts
 255      *    to the previous one.
 256      *    @return boolean    False if we drop off
 257      *                       the bottom of the list.
 258      *    @access public
 259      */
 260     function leave() {
 261         if (count($this->_stack) == 1) {
 262             return false;
 263         }
 264         array_pop($this->_stack);
 265         return true;
 266     }
 267 }
 268
 269 /**
 270  *    Accepts text and breaks it into tokens.
 271  *    Some optimisation to make the sure the
 272  *    content is only scanned by the PHP regex
 273  *    parser once. Lexer modes must not start
 274  *    with leading underscores.
 275  *    @package Doku
 276  *    @subpackage Lexer
 277  */
 278 class Doku_Lexer {
 279     var $_regexes;
 280     var $_parser;
 281     var $_mode;
 282     var $_mode_handlers;
 283     var $_case;
 284
 285     /**
 286      *    Sets up the lexer in case insensitive matching
 287      *    by default.
 288      *    @param Doku_Parser $parser  Handling strategy by
 289      *                                    reference.
 290      *    @param string $start            Starting handler.
 291      *    @param boolean $case            True for case sensitive.
 292      *    @access public
 293      */
 294     function Doku_Lexer(&$parser, $start = "accept", $case = false) {
 295         $this->_case = $case;
 296         $this->_regexes = array();
 297         $this->_parser = &$parser;
 298         $this->_mode = &new Doku_LexerStateStack($start);
 299         $this->_mode_handlers = array();
 300     }
 301
 302     /**
 303      *    Adds a token search pattern for a particular
 304      *    parsing mode. The pattern does not change the
 305      *    current mode.
 306      *    @param string $pattern      Perl style regex, but ( and )
 307      *                                lose the usual meaning.
 308      *    @param string $mode         Should only apply this
 309      *                                pattern when dealing with
 310      *                                this type of input.
 311      *    @access public
 312      */
 313     function addPattern($pattern, $mode = "accept") {
 314         if (! isset($this->_regexes[$mode])) {
 315             $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 316         }
 317         $this->_regexes[$mode]->addPattern($pattern);
 318     }
 319
 320     /**
 321      *    Adds a pattern that will enter a new parsing
 322      *    mode. Useful for entering parenthesis, strings,
 323      *    tags, etc.
 324      *    @param string $pattern      Perl style regex, but ( and )
 325      *                                lose the usual meaning.
 326      *    @param string $mode         Should only apply this
 327      *                                pattern when dealing with
 328      *                                this type of input.
 329      *    @param string $new_mode     Change parsing to this new
 330      *                                nested mode.
 331      *    @access public
 332      */
 333     function addEntryPattern($pattern, $mode, $new_mode) {
 334         if (! isset($this->_regexes[$mode])) {
 335             $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 336         }
 337         $this->_regexes[$mode]->addPattern($pattern, $new_mode);
 338     }
 339
 340     /**
 341      *    Adds a pattern that will exit the current mode
 342      *    and re-enter the previous one.
 343      *    @param string $pattern      Perl style regex, but ( and )
 344      *                                lose the usual meaning.
 345      *    @param string $mode         Mode to leave.
 346      *    @access public
 347      */
 348     function addExitPattern($pattern, $mode) {
 349         if (! isset($this->_regexes[$mode])) {
 350             $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 351         }
 352         $this->_regexes[$mode]->addPattern($pattern, "__exit");
 353     }
 354
 355     /**
 356      *    Adds a pattern that has a special mode. Acts as an entry
 357      *    and exit pattern in one go, effectively calling a special
 358      *    parser handler for this token only.
 359      *    @param string $pattern      Perl style regex, but ( and )
 360      *                                lose the usual meaning.
 361      *    @param string $mode         Should only apply this
 362      *                                pattern when dealing with
 363      *                                this type of input.
 364      *    @param string $special      Use this mode for this one token.
 365      *    @access public
 366      */
 367     function addSpecialPattern($pattern, $mode, $special) {
 368         if (! isset($this->_regexes[$mode])) {
 369             $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 370         }
 371         $this->_regexes[$mode]->addPattern($pattern, "_$special");
 372     }
 373
 374     /**
 375      *    Adds a mapping from a mode to another handler.
 376      *    @param string $mode        Mode to be remapped.
 377      *    @param string $handler     New target handler.
 378      *    @access public
 379      */
 380     function mapHandler($mode, $handler) {
 381         $this->_mode_handlers[$mode] = $handler;
 382     }
 383
 384     /**
 385      *    Splits the page text into tokens. Will fail
 386      *    if the handlers report an error or if no
 387      *    content is consumed. If successful then each
 388      *    unparsed and parsed token invokes a call to the
 389      *    held listener.
 390      *    @param string $raw        Raw HTML text.
 391      *    @return boolean           True on success, else false.
 392      *    @access public
 393      */
 394     function parse($raw) {
 395         if (! isset($this->_parser)) {
 396             return false;
 397         }
 398         $initialLength = strlen($raw);
 399         $length = $initialLength;
 400         $pos = 0;
 401         while (is_array($parsed = $this->_reduce($raw))) {
 402             list($unmatched, $matched, $mode) = $parsed;
 403             $currentLength = strlen($raw);
 404             $matchPos = $initialLength - $currentLength - strlen($matched);
 405             if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
 406                 return false;
 407             }
 408             if ($currentLength == $length) {
 409                 return false;
 410             }
 411             $length = $currentLength;
 412             $pos = $initialLength - $currentLength;
 413         }
 414         if (!$parsed) {
 415             return false;
 416         }
 417         return $this->_invokeParser($raw, DOKU_LEXER_UNMATCHED, $pos);
 418     }
 419
 420     /**
 421      *    Sends the matched token and any leading unmatched
 422      *    text to the parser changing the lexer to a new
 423      *    mode if one is listed.
 424      *    @param string $unmatched    Unmatched leading portion.
 425      *    @param string $matched      Actual token match.
 426      *    @param string $mode         Mode after match. A boolean
 427      *                                false mode causes no change.
 428      *    @param int $pos         Current byte index location in raw doc
 429      *                                thats being parsed
 430      *    @return boolean             False if there was any error
 431      *                                from the parser.
 432      *    @access private
 433      */
 434     function _dispatchTokens($unmatched, $matched, $mode = false, $initialPos, $matchPos) {
 435         if (! $this->_invokeParser($unmatched, DOKU_LEXER_UNMATCHED, $initialPos) ){
 436             return false;
 437         }
 438         if ($this->_isModeEnd($mode)) {
 439             if (! $this->_invokeParser($matched, DOKU_LEXER_EXIT, $matchPos)) {
 440                 return false;
 441             }
 442             return $this->_mode->leave();
 443         }
 444         if ($this->_isSpecialMode($mode)) {
 445             $this->_mode->enter($this->_decodeSpecial($mode));
 446             if (! $this->_invokeParser($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
 447                 return false;
 448             }
 449             return $this->_mode->leave();
 450         }
 451         if (is_string($mode)) {
 452             $this->_mode->enter($mode);
 453             return $this->_invokeParser($matched, DOKU_LEXER_ENTER, $matchPos);
 454         }
 455         return $this->_invokeParser($matched, DOKU_LEXER_MATCHED, $matchPos);
 456     }
 457
 458     /**
 459      *    Tests to see if the new mode is actually to leave
 460      *    the current mode and pop an item from the matching
 461      *    mode stack.
 462      *    @param string $mode    Mode to test.
 463      *    @return boolean        True if this is the exit mode.
 464      *    @access private
 465      */
 466     function _isModeEnd($mode) {
 467         return ($mode === "__exit");
 468     }
 469
 470     /**
 471      *    Test to see if the mode is one where this mode
 472      *    is entered for this token only and automatically
 473      *    leaves immediately afterwoods.
 474      *    @param string $mode    Mode to test.
 475      *    @return boolean        True if this is the exit mode.
 476      *    @access private
 477      */
 478     function _isSpecialMode($mode) {
 479         return (strncmp($mode, "_", 1) == 0);
 480     }
 481
 482     /**
 483      *    Strips the magic underscore marking single token
 484      *    modes.
 485      *    @param string $mode    Mode to decode.
 486      *    @return string         Underlying mode name.
 487      *    @access private
 488      */
 489     function _decodeSpecial($mode) {
 490         return substr($mode, 1);
 491     }
 492
 493     /**
 494      *    Calls the parser method named after the current
 495      *    mode. Empty content will be ignored. The lexer
 496      *    has a parser handler for each mode in the lexer.
 497      *    @param string $content        Text parsed.
 498      *    @param boolean $is_match      Token is recognised rather
 499      *                                  than unparsed data.
 500      *    @param int $pos         Current byte index location in raw doc
 501      *                                thats being parsed
 502      *    @access private
 503      */
 504     function _invokeParser($content, $is_match, $pos) {
 505         if (($content === "") || ($content === false)) {
 506             return true;
 507         }
 508         $handler = $this->_mode->getCurrent();
 509         if (isset($this->_mode_handlers[$handler])) {
 510             $handler = $this->_mode_handlers[$handler];
 511         }
 512
 513         // modes starting with plugin_ are all handled by the same
 514         // handler but with an additional parameter
 515         if(substr($handler,0,7)=='plugin_'){
 516           list($handler,$plugin) = explode('_',$handler,2);
 517               return $this->_parser->$handler($content, $is_match, $pos, $plugin);
 518         }
 519
 520             return $this->_parser->$handler($content, $is_match, $pos);
 521         }
 522
 523     /**
 524      *    Tries to match a chunk of text and if successful
 525      *    removes the recognised chunk and any leading
 526      *    unparsed data. Empty strings will not be matched.
 527      *    @param string $raw         The subject to parse. This is the
 528      *                               content that will be eaten.
 529      *    @return array              Three item list of unparsed
 530      *                               content followed by the
 531      *                               recognised token and finally the
 532      *                               action the parser is to take.
 533      *                               True if no match, false if there
 534      *                               is a parsing error.
 535      *    @access private
 536      */
 537     function _reduce(&$raw) {
 538         if (! isset($this->_regexes[$this->_mode->getCurrent()])) {
 539             return false;
 540         }
 541         if ($raw === "") {
 542             return true;
 543         }
 544         if ($action = $this->_regexes[$this->_mode->getCurrent()]->split($raw, $split)) {
 545             list($unparsed, $match, $raw) = $split;
 546             return array($unparsed, $match, $action);
 547         }
 548         return true;
 549     }
 550 }
 551
 552 /**
 553 * Escapes regex characters other than (, ) and /
 554 * @TODO
 555 */
 556 function Doku_Lexer_Escape($str) {
 557     //$str = addslashes($str);
 558     $chars = array(
 559         '/\\\\/',
 560         '/\./',
 561         '/\+/',
 562         '/\*/',
 563         '/\?/',
 564         '/\[/',
 565         '/\^/',
 566         '/\]/',
 567         '/\$/',
 568         '/\{/',
 569         '/\}/',
 570         '/\=/',
 571         '/\!/',
 572         '/\</',
 573         '/\>/',
 574         '/\|/',
 575         '/\:/'
 576         );
 577
 578     $escaped = array(
 579         '\\\\\\\\',
 580         '\.',
 581         '\+',
 582         '\*',
 583         '\?',
 584         '\[',
 585         '\^',
 586         '\]',
 587         '\$',
 588         '\{',
 589         '\}',
 590         '\=',
 591         '\!',
 592         '\<',
 593         '\>',
 594         '\|',
 595         '\:'
 596         );
 597     return preg_replace($chars, $escaped, $str);
 598 }
 599
 600 //Setup VIM: ex: et ts=4 sw=4 enc=utf-8 :