3 * Author Markus Baker: http://www.lastcraft.com
4 * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * http://www.phppatterns.com/index.php/article/articleview/106/1/2/
10 * @version $Id: lexer.php,v 1.1 2005/03/23 23:14:09 harryf Exp $
16 if(!defined('DOKU_INC')) die('meh.');
21 define("DOKU_LEXER_ENTER", 1);
22 define("DOKU_LEXER_MATCHED", 2);
23 define("DOKU_LEXER_UNMATCHED", 3);
24 define("DOKU_LEXER_EXIT", 4);
25 define("DOKU_LEXER_SPECIAL", 5);
29 * Compounded regular expression. Any of
30 * the contained patterns could match and
31 * when one does its label is returned.
35 class Doku_LexerParallelRegex
{
42 * Constructor. Starts with no patterns.
43 * @param boolean $case True for case sensitive, false
47 function Doku_LexerParallelRegex($case) {
49 $this->_patterns
= array();
50 $this->_labels
= array();
55 * Adds a pattern with an optional label.
56 * @param mixed $pattern Perl style regex. Must be UTF-8
57 * encoded. If it's a string, the (, )
58 * lose their meaning unless they
59 * form part of a lookahead or
60 * lookbehind assertion.
61 * @param string $label Label of regex to be returned
62 * on a match. Label must be ASCII
65 function addPattern($pattern, $label = true) {
66 $count = count($this->_patterns
);
67 $this->_patterns
[$count] = $pattern;
68 $this->_labels
[$count] = $label;
73 * Attempts to match all patterns at once against
75 * @param string $subject String to match against.
76 * @param string $match First matched portion of
78 * @return boolean True on success.
81 function match($subject, &$match) {
82 if (count($this->_patterns
) == 0) {
85 if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
91 $size = count($matches);
92 for ($i = 1; $i < $size; $i++
) {
93 if ($matches[$i] && isset($this->_labels
[$i - 1])) {
94 return $this->_labels
[$i - 1];
101 * Attempts to split the string against all patterns at once
103 * @param string $subject String to match against.
104 * @param array $split The split result: array containing, pre-match, match & post-match strings
105 * @return boolean True on success.
108 * @author Christopher Smith <chris@jalakai.co.uk>
110 function split($subject, &$split) {
111 if (count($this->_patterns
) == 0) {
115 if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
116 if(function_exists('preg_last_error')){
117 $err = preg_last_error();
119 case PREG_BACKTRACK_LIMIT_ERROR
:
120 msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',-1);
122 case PREG_RECURSION_LIMIT_ERROR
:
123 msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',-1);
125 case PREG_BAD_UTF8_ERROR
:
126 msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin',-1);
128 case PREG_INTERNAL_ERROR
:
129 msg('A PCRE internal error occured. This might be caused by a faulty plugin',-1);
134 $split = array($subject, "", "");
138 $idx = count($matches)-2;
139 list($pre, $post) = preg_split($this->_patterns
[$idx].$this->_getPerlMatchingFlags(), $subject, 2);
140 $split = array($pre, $matches[0], $post);
142 return isset($this->_labels
[$idx]) ?
$this->_labels
[$idx] : true;
146 * Compounds the patterns into a single
147 * regular expression separated with the
148 * "or" operator. Caches the regex.
149 * Will automatically escape (, ) and / tokens.
150 * @param array $patterns List of patterns in order.
153 function _getCompoundedRegex() {
154 if ($this->_regex
== null) {
155 $cnt = count($this->_patterns
);
156 for ($i = 0; $i < $cnt; $i++
) {
159 * decompose the input pattern into "(", "(?", ")",
160 * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
163 preg_match_all('/\\\\.|' .
166 '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
167 '[^[()\\\\]+/', $this->_patterns
[$i], $elts);
172 foreach ($elts[0] as $elt) {
174 * for "(", ")" remember the nesting level, add "\"
175 * only to the non-"(?" ones.
184 $level--; /* closing (? */
194 if (substr($elt, 0, 1) == '\\')
197 $pattern .= str_replace('/', '\/', $elt);
200 $this->_patterns
[$i] = "($pattern)";
202 $this->_regex
= "/" . implode("|", $this->_patterns
) . "/" . $this->_getPerlMatchingFlags();
204 return $this->_regex
;
208 * Accessor for perl regex mode flags to use.
209 * @return string Perl regex flags.
/**
 * Accessor for perl regex mode flags to use.
 *
 * The "m", "s" and "S" modifiers are always present; "i" is appended
 * when the regex is not case sensitive.
 *
 * @return string       Perl regex flags.
 * @access private
 */
function _getPerlMatchingFlags() {
    $flags = "msS";
    if (! $this->_case) {
        $flags .= "i";
    }
    return $flags;
}
218 * States for a stack machine.
222 class Doku_LexerStateStack
{
226 * Constructor. Starts in named state.
227 * @param string $start Starting state name.
/**
 * Constructor. Starts in named state.
 *
 * Declared as __construct so it is recognised as the constructor on
 * current PHP versions; PHP 8 no longer treats a method named after
 * its class as a constructor, which would leave the stack unset.
 *
 * @param string $start        Starting state name.
 * @access public
 */
function __construct($start) {
    // The stack always holds at least the starting state.
    $this->_stack = array($start);
}

/**
 * PHP 4 style constructor, kept for backward compatibility with code
 * that invokes it by name. Simply delegates to __construct().
 *
 * @param string $start        Starting state name.
 * @access public
 */
function Doku_LexerStateStack($start) {
    $this->__construct($start);
}
235 * Accessor for current state.
236 * @return string State.
/**
 * Accessor for current state.
 *
 * The current state is the most recently pushed entry, i.e. the
 * last element of the stack array.
 *
 * @return string       State.
 * @access public
 */
function getCurrent() {
    $top = count($this->_stack) - 1;
    return $this->_stack[$top];
}
244 * Adds a state to the stack and sets it
245 * to be the current state.
246 * @param string $state New state.
/**
 * Adds a state to the stack and sets it
 * to be the current state.
 *
 * @param string $state        New state.
 * @access public
 */
function enter($state) {
    // Appending makes $state the last element, which getCurrent() reports.
    $this->_stack[] = $state;
}
254 * Leaves the current state and reverts
255 * to the previous one.
256 * @return boolean False if we drop off
257 * the bottom of the list.
261 if (count($this->_stack
) == 1) {
264 array_pop($this->_stack
);
270 * Accepts text and breaks it into tokens.
271 * Some optimisation to make sure the
272 * content is only scanned by the PHP regex
273 * parser once. Lexer modes must not start
274 * with leading underscores.
286 * Sets up the lexer in case insensitive matching
288 * @param Doku_Parser $parser Handling strategy by
290 * @param string $start Starting handler.
291 * @param boolean $case True for case sensitive.
/**
 * Sets up the lexer; matching is case insensitive unless $case is true.
 *
 * Declared as __construct so it is recognised as the constructor on
 * current PHP versions; PHP 8 no longer treats a method named after
 * its class as a constructor.
 *
 * @param Doku_Parser $parser  Handling strategy; held by reference.
 * @param string $start        Starting handler.
 * @param boolean $case        True for case sensitive.
 * @access public
 */
function __construct(&$parser, $start = "accept", $case = false) {
    $this->_case = $case;
    $this->_regexes = array();
    $this->_parser = &$parser;
    // Plain assignment: the "= &new" form is a parse error from PHP 7 on.
    $this->_mode = new Doku_LexerStateStack($start);
    $this->_mode_handlers = array();
}

/**
 * PHP 4 style constructor, kept for backward compatibility with code
 * that invokes it by name. Simply delegates to __construct().
 *
 * @param Doku_Parser $parser  Handling strategy; held by reference.
 * @param string $start        Starting handler.
 * @param boolean $case        True for case sensitive.
 * @access public
 */
function Doku_Lexer(&$parser, $start = "accept", $case = false) {
    $this->__construct($parser, $start, $case);
}
303 * Adds a token search pattern for a particular
304 * parsing mode. The pattern does not change the
306 * @param string $pattern Perl style regex, but ( and )
307 * lose the usual meaning.
308 * @param string $mode Should only apply this
309 * pattern when dealing with
310 * this type of input.
/**
 * Adds a token search pattern for a particular parsing mode.
 * The pattern is registered without a target mode, so matching it
 * does not change the current mode.
 *
 * @param string $pattern      Perl style regex, but ( and )
 *                             lose the usual meaning.
 * @param string $mode         Should only apply this
 *                             pattern when dealing with
 *                             this type of input.
 * @access public
 */
function addPattern($pattern, $mode = "accept") {
    $modeIsKnown = isset($this->_regexes[$mode]);
    if (! $modeIsKnown) {
        // First pattern for this mode: create its compound regex.
        $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
    }
    $this->_regexes[$mode]->addPattern($pattern);
}
321 * Adds a pattern that will enter a new parsing
322 * mode. Useful for entering parenthesis, strings,
324 * @param string $pattern Perl style regex, but ( and )
325 * lose the usual meaning.
326 * @param string $mode Should only apply this
327 * pattern when dealing with
328 * this type of input.
329 * @param string $new_mode Change parsing to this new
/**
 * Adds a pattern that will enter a new parsing mode.
 * The new mode is stored as the pattern's label.
 *
 * @param string $pattern      Perl style regex, but ( and )
 *                             lose the usual meaning.
 * @param string $mode         Should only apply this
 *                             pattern when dealing with
 *                             this type of input.
 * @param string $new_mode     Change parsing to this new mode.
 * @access public
 */
function addEntryPattern($pattern, $mode, $new_mode) {
    $modeIsKnown = isset($this->_regexes[$mode]);
    if (! $modeIsKnown) {
        // First pattern for this mode: create its compound regex.
        $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
    }
    $this->_regexes[$mode]->addPattern($pattern, $new_mode);
}
341 * Adds a pattern that will exit the current mode
342 * and re-enter the previous one.
343 * @param string $pattern Perl style regex, but ( and )
344 * lose the usual meaning.
345 * @param string $mode Mode to leave.
/**
 * Adds a pattern that will exit the current mode
 * and re-enter the previous one. The pattern is labelled with the
 * reserved "__exit" marker that _isModeEnd() tests for.
 *
 * @param string $pattern      Perl style regex, but ( and )
 *                             lose the usual meaning.
 * @param string $mode         Mode to leave.
 * @access public
 */
function addExitPattern($pattern, $mode) {
    $modeIsKnown = isset($this->_regexes[$mode]);
    if (! $modeIsKnown) {
        // First pattern for this mode: create its compound regex.
        $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
    }
    $this->_regexes[$mode]->addPattern($pattern, "__exit");
}
356 * Adds a pattern that has a special mode. Acts as an entry
357 * and exit pattern in one go, effectively calling a special
358 * parser handler for this token only.
359 * @param string $pattern Perl style regex, but ( and )
360 * lose the usual meaning.
361 * @param string $mode Should only apply this
362 * pattern when dealing with
363 * this type of input.
364 * @param string $special Use this mode for this one token.
/**
 * Adds a pattern that has a special mode. Acts as an entry
 * and exit pattern in one go. The pattern is labelled with the
 * special mode name prefixed by a single underscore, which is the
 * marker _isSpecialMode() looks for.
 *
 * @param string $pattern      Perl style regex, but ( and )
 *                             lose the usual meaning.
 * @param string $mode         Should only apply this
 *                             pattern when dealing with
 *                             this type of input.
 * @param string $special      Use this mode for this one token.
 * @access public
 */
function addSpecialPattern($pattern, $mode, $special) {
    $modeIsKnown = isset($this->_regexes[$mode]);
    if (! $modeIsKnown) {
        // First pattern for this mode: create its compound regex.
        $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
    }
    $this->_regexes[$mode]->addPattern($pattern, "_$special");
}
375 * Adds a mapping from a mode to another handler.
376 * @param string $mode Mode to be remapped.
377 * @param string $handler New target handler.
/**
 * Adds a mapping from a mode to another handler.
 * A later mapping for the same mode replaces the earlier one.
 *
 * @param string $mode        Mode to be remapped.
 * @param string $handler     New target handler.
 * @access public
 */
function mapHandler($mode, $handler) {
    $handlers = &$this->_mode_handlers;
    $handlers[$mode] = $handler;
}
385 * Splits the page text into tokens. Will fail
386 * if the handlers report an error or if no
387 * content is consumed. If successful then each
388 * unparsed and parsed token invokes a call to the
390 * @param string $raw Raw HTML text.
391 * @return boolean True on success, else false.
394 function parse($raw) {
395 if (! isset($this->_parser
)) {
398 $initialLength = strlen($raw);
399 $length = $initialLength;
401 while (is_array($parsed = $this->_reduce($raw))) {
402 list($unmatched, $matched, $mode) = $parsed;
403 $currentLength = strlen($raw);
404 $matchPos = $initialLength - $currentLength - strlen($matched);
405 if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
408 if ($currentLength == $length) {
411 $length = $currentLength;
412 $pos = $initialLength - $currentLength;
417 return $this->_invokeParser($raw, DOKU_LEXER_UNMATCHED
, $pos);
421 * Sends the matched token and any leading unmatched
422 * text to the parser changing the lexer to a new
423 * mode if one is listed.
424 * @param string $unmatched Unmatched leading portion.
425 * @param string $matched Actual token match.
426 * @param string $mode Mode after match. A boolean
427 * false mode causes no change.
428 * @param int $pos Current byte index location in raw doc
430 * @return boolean False if there was any error
434 function _dispatchTokens($unmatched, $matched, $mode = false, $initialPos, $matchPos) {
435 if (! $this->_invokeParser($unmatched, DOKU_LEXER_UNMATCHED
, $initialPos) ){
438 if ($this->_isModeEnd($mode)) {
439 if (! $this->_invokeParser($matched, DOKU_LEXER_EXIT
, $matchPos)) {
442 return $this->_mode
->leave();
444 if ($this->_isSpecialMode($mode)) {
445 $this->_mode
->enter($this->_decodeSpecial($mode));
446 if (! $this->_invokeParser($matched, DOKU_LEXER_SPECIAL
, $matchPos)) {
449 return $this->_mode
->leave();
451 if (is_string($mode)) {
452 $this->_mode
->enter($mode);
453 return $this->_invokeParser($matched, DOKU_LEXER_ENTER
, $matchPos);
455 return $this->_invokeParser($matched, DOKU_LEXER_MATCHED
, $matchPos);
459 * Tests to see if the new mode is actually to leave
460 * the current mode and pop an item from the matching
462 * @param string $mode Mode to test.
463 * @return boolean True if this is the exit mode.
/**
 * Tests to see if the new mode is actually to leave
 * the current mode. Only the exact string "__exit" qualifies;
 * the comparison is strict, so non-string values never match.
 *
 * @param string $mode    Mode to test.
 * @return boolean        True if this is the exit mode.
 * @access private
 */
function _isModeEnd($mode) {
    $exitMarker = "__exit";
    return $exitMarker === $mode;
}
471 * Test to see if the mode is one where this mode
472 * is entered for this token only and automatically
473 * leaves immediately afterwards.
474 * @param string $mode Mode to test.
475 * @return boolean True if this is a special mode.
/**
 * Test to see if the mode is one where this mode
 * is entered for this token only and automatically
 * left again immediately afterwards. Such modes are recognised by
 * a leading underscore.
 *
 * @param string $mode    Mode to test.
 * @return boolean        True if the mode starts with an underscore.
 * @access private
 */
function _isSpecialMode($mode) {
    // Compare only the first character against the underscore marker.
    $comparison = strncmp($mode, "_", 1);
    return (0 == $comparison);
}
483 * Strips the magic underscore marking single token
485 * @param string $mode Mode to decode.
486 * @return string Underlying mode name.
/**
 * Strips the magic underscore marking single token
 * modes by dropping the first character of the mode name.
 *
 * @param string $mode    Mode to decode.
 * @return string         Underlying mode name.
 * @access private
 */
function _decodeSpecial($mode) {
    $underlying = substr($mode, 1);
    return $underlying;
}
494 * Calls the parser method named after the current
495 * mode. Empty content will be ignored. The lexer
496 * has a parser handler for each mode in the lexer.
497 * @param string $content Text parsed.
498 * @param boolean $is_match Token is recognised rather
499 * than unparsed data.
500 * @param int $pos Current byte index location in raw doc
504 function _invokeParser($content, $is_match, $pos) {
505 if (($content === "") ||
($content === false)) {
508 $handler = $this->_mode
->getCurrent();
509 if (isset($this->_mode_handlers
[$handler])) {
510 $handler = $this->_mode_handlers
[$handler];
513 // modes starting with plugin_ are all handled by the same
514 // handler but with an additional parameter
515 if(substr($handler,0,7)=='plugin_'){
516 list($handler,$plugin) = explode('_',$handler,2);
517 return $this->_parser
->$handler($content, $is_match, $pos, $plugin);
520 return $this->_parser
->$handler($content, $is_match, $pos);
524 * Tries to match a chunk of text and if successful
525 * removes the recognised chunk and any leading
526 * unparsed data. Empty strings will not be matched.
527 * @param string $raw The subject to parse. This is the
528 * content that will be eaten.
529 * @return array Three item list of unparsed
530 * content followed by the
531 * recognised token and finally the
532 * action the parser is to take.
533 * True if no match, false if there
534 * is a parsing error.
537 function _reduce(&$raw) {
538 if (! isset($this->_regexes
[$this->_mode
->getCurrent()])) {
544 if ($action = $this->_regexes
[$this->_mode
->getCurrent()]->split($raw, $split)) {
545 list($unparsed, $match, $raw) = $split;
546 return array($unparsed, $match, $action);
553 * Escapes regex characters other than (, ) and /
556 function Doku_Lexer_Escape($str) {
557 //$str = addslashes($str);
597 return preg_replace($chars, $escaped, $str);
600 //Setup VIM: ex: et ts=4 sw=4 enc=utf-8 :