first commit. dokuwiki.
[h2N7SspZmY.git] / inc / parser / lexer.php
blobafd260a05cbaa951f69244166063960c71e4c025
1 <?php
2 /**
3 * Author Marcus Baker: http://www.lastcraft.com
4 * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * http://www.phppatterns.com/index.php/article/articleview/106/1/2/
7 * @author Marcus Baker
8 * @package Doku
9 * @subpackage Lexer
10 * @version $Id: lexer.php,v 1.1 2005/03/23 23:14:09 harryf Exp $
/**
 * Refuse to continue when DOKU_INC has not been defined.
 */
if (!defined('DOKU_INC')) {
    die('meh.');
}

/**#@+
 * lexer mode constant
 *
 * These values are handed to the parser handler as its second argument.
 */
define('DOKU_LEXER_ENTER', 1);     // sent together with a token that enters a new mode
define('DOKU_LEXER_MATCHED', 2);   // sent together with a token that keeps the current mode
define('DOKU_LEXER_UNMATCHED', 3); // sent together with text that no pattern matched
define('DOKU_LEXER_EXIT', 4);      // sent together with a token that leaves the current mode
define('DOKU_LEXER_SPECIAL', 5);   // sent together with a single-token (special) mode match
/**#@-*/
/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does,
 * its label is returned.
 *
 * @package Doku
 * @subpackage Lexer
 */
class Doku_LexerParallelRegex {
    /** @var array Patterns exactly as handed to addPattern(), in insertion order. */
    var $_patterns;
    /** @var array Labels, index-aligned with $_patterns. */
    var $_labels;
    /** @var string|null Cached compound regex, null when it has to be (re)built. */
    var $_regex;
    /** @var boolean True for case sensitive matching. */
    var $_case;
    /** @var array Escaped and "(...)"-wrapped form of each pattern, index-aligned with $_patterns. */
    var $_compiled;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     * @access public
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_compiled = array();
        $this->_regex = null;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * @param mixed $pattern       Perl style regex. Must be UTF-8
     *                             encoded. If it is a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param string $label        Label of regex to be returned
     *                             on a match. Label must be ASCII.
     * @access public
     */
    function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        // force a rebuild of the compound regex on next use
        $this->_regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of
     *                             subject. Set to "" when nothing matched;
     *                             left untouched when no patterns exist.
     * @return mixed               Label of the matching pattern (true when
     *                             no label applies), false on failure.
     * @access public
     */
    function match($subject, &$match) {
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            $match = "";
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        for ($i = 1; $i < $size; $i++) {
            // compare against "" instead of testing truthiness: the
            // token "0" is a perfectly valid match but evaluates to false
            if ($matches[$i] !== "" && isset($this->_labels[$i - 1])) {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once.
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing
     *                             pre-match, match & post-match strings.
     *                             On failure it is array($subject, "", "");
     *                             left untouched when no patterns exist.
     * @return mixed               Label of the matching pattern (true when
     *                             the label is not set), false on failure.
     * @access public
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    function split($subject, &$split) {
        if (count($this->_patterns) == 0) {
            return false;
        }

        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            // distinguish "nothing matched" from a PCRE failure and tell the user
            if (function_exists('preg_last_error')) {
                $err = preg_last_error();
                switch ($err) {
                    case PREG_BACKTRACK_LIMIT_ERROR:
                        msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',-1);
                        break;
                    case PREG_RECURSION_LIMIT_ERROR:
                        msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',-1);
                        break;
                    case PREG_BAD_UTF8_ERROR:
                        msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin',-1);
                        break;
                    case PREG_INTERNAL_ERROR:
                        msg('A PCRE internal error occured. This might be caused by a faulty plugin',-1);
                        break;
                }
            }

            $split = array($subject, "", "");
            return false;
        }

        // preg_match() drops trailing groups that did not take part in the
        // match, so the last entry belongs to the pattern that matched
        $idx = count($matches)-2;

        // the compiled pattern is wrapped in "(" ")" which double as regex delimiters here
        list($pre, $post) = preg_split($this->_compiled[$idx].$this->_getPerlMatchingFlags(), $subject, 2);
        $split = array($pre, $matches[0], $post);

        return isset($this->_labels[$idx]) ? $this->_labels[$idx] : true;
    }

    /**
     * Compounds the patterns into a single regular expression
     * separated with the "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The escaped form of every pattern is kept apart from the raw
     * pattern, so adding a pattern after the regex was built once does
     * not escape the older patterns a second time.
     *
     * @return string              The compound regex including delimiters and flags.
     * @access private
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            if (!is_array($this->_compiled)) {
                $this->_compiled = array();
            }
            $cnt = count($this->_patterns);
            // only patterns added since the last build still need compiling
            for ($i = count($this->_compiled); $i < $cnt; $i++) {
                $this->_compiled[$i] = $this->_compilePattern($this->_patterns[$i]);
            }
            $this->_regex = "/" . implode("|", $this->_compiled) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Escapes a single raw pattern and wraps it in a capturing group.
     *
     * @param string $raw          Pattern as passed to addPattern().
     * @return string              "(...)" wrapped pattern with plain (, ) and / escaped.
     * @access private
     */
    function _compilePattern($raw) {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $raw, $elts);

        $pattern = "";
        $level = 0;

        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             *
             * NOTE(review): only "(?" raises the level, so a plain "(...)"
             * nested inside a "(?...)" group has its ")" taken as the end
             * of that group - confirm whether this is intended.
             */
            switch ($elt) {
                case '(':
                    $pattern .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $pattern .= '\\';
                    }
                    $pattern .= ')';
                    break;
                case '(?':
                    $level++;
                    $pattern .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) == '\\') {
                        // already escaped sequence, keep untouched
                        $pattern .= $elt;
                    } else {
                        // "/" is the delimiter of the compound regex
                        $pattern .= str_replace('/', '\/', $elt);
                    }
            }
        }
        return "($pattern)";
    }

    /**
     * Accessor for perl regex mode flags to use.
     *
     * @return string              Perl regex flags.
     * @access private
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}
/**
 * States for a stack machine.
 *
 * @package Doku
 * @subpackage Lexer
 */
class Doku_LexerStateStack {
    /** @var array State names, the last entry being the current state. */
    var $_stack;

    /**
     * Constructor. Starts in named state.
     *
     * @param string $start        Starting state name.
     * @access public
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * Accessor for current state.
     *
     * @return string              State.
     * @access public
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     * Adds a state to the stack and sets it
     * to be the current state.
     *
     * @param string $state        New state.
     * @access public
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     * Leaves the current state and reverts
     * to the previous one.
     *
     * @return boolean             False if we drop off
     *                             the bottom of the list.
     * @access public
     */
    function leave() {
        // the starting state is never removed
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned
 * by the PHP regex parser once. Lexer modes must not start
 * with leading underscores.
 *
 * @package Doku
 * @subpackage Lexer
 */
class Doku_Lexer {
    /** @var array Doku_LexerParallelRegex objects keyed by mode name. */
    var $_regexes;
    /** @var object Handler object whose methods are called for each token. */
    var $_parser;
    /** @var Doku_LexerStateStack Stack of the modes entered so far. */
    var $_mode;
    /** @var array Map of mode name to the handler name to use instead. */
    var $_mode_handlers;
    /** @var boolean True for case sensitive matching. */
    var $_case;

    /**
     * Sets up the lexer in case insensitive matching
     * by default.
     *
     * @param Doku_Parser $parser  Handling strategy by
     *                             reference.
     * @param string $start        Starting handler.
     * @param boolean $case        True for case sensitive.
     * @access public
     */
    function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        // plain "new": assigning the result of new by reference is a parse error since PHP 7
        $this->_mode = new Doku_LexerStateStack($start);
        $this->_mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular
     * parsing mode. The pattern does not change the
     * current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @access public
     */
    function addPattern($pattern, $mode = "accept") {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing
     * mode. Useful for entering parenthesis, strings,
     * tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @param string $new_mode     Change parsing to this new
     *                             nested mode.
     * @access public
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode
     * and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Mode to leave.
     * @access public
     */
    function addExitPattern($pattern, $mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
        }
        // "__exit" is the label _isModeEnd() looks for
        $this->_regexes[$mode]->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode. Acts as an entry
     * and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @param string $special      Use this mode for this one token.
     * @access public
     */
    function addSpecialPattern($pattern, $mode, $special) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
        }
        // the leading underscore marks the label as a single-token mode
        $this->_regexes[$mode]->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode         Mode to be remapped.
     * @param string $handler      New target handler.
     * @access public
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens. Will fail
     * if the handlers report an error or if no
     * content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the
     * held listener.
     *
     * @param string $raw          Raw text to tokenise.
     * @return boolean             True on success, else false.
     * @access public
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // _reduce() eats the consumed part off the front of $raw on every pass
        while (is_array($parsed = $this->_reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset at which the matched token starts in the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed: bail out instead of looping forever
            if ($currentLength == $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match any pattern
        return $this->_invokeParser($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched    Unmatched leading portion.
     * @param string $matched      Actual token match.
     * @param mixed $mode          Mode after match. A non-string
     *                             mode causes no change.
     * @param int $initialPos      Byte index in the raw document at
     *                             which the unmatched portion starts.
     * @param int $matchPos        Byte index in the raw document at
     *                             which the matched token starts.
     * @return boolean             False if there was any error
     *                             from the parser.
     * @access private
     */
    function _dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos) {
        if (! $this->_invokeParser($unmatched, DOKU_LEXER_UNMATCHED, $initialPos) ){
            return false;
        }
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            // enter the special mode for just this one token
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if (is_string($mode)) {
            $this->_mode->enter($mode);
            return $this->_invokeParser($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->_invokeParser($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave
     * the current mode and pop an item from the matching
     * mode stack.
     *
     * @param mixed $mode          Mode to test.
     * @return boolean             True if this is the exit mode.
     * @access private
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode
     * is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param mixed $mode          Mode to test.
     * @return boolean             True if this is a single-token mode.
     * @access private
     */
    function _isSpecialMode($mode) {
        // the label is boolean true for patterns without a mode change
        return is_string($mode) && (strncmp($mode, "_", 1) == 0);
    }

    /**
     * Strips the magic underscore marking single token
     * modes.
     *
     * @param string $mode         Mode to decode.
     * @return string              Underlying mode name.
     * @access private
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current
     * mode. Empty content will be ignored. The lexer
     * has a parser handler for each mode in the lexer.
     *
     * @param string $content      Text parsed.
     * @param int $is_match        One of the DOKU_LEXER_* constants
     *                             describing the kind of token.
     * @param int $pos             Current byte index location in raw doc
     *                             that is being parsed.
     * @return mixed               True for empty content, otherwise
     *                             whatever the handler returns.
     * @access private
     */
    function _invokeParser($content, $is_match, $pos) {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->_mode->getCurrent();
        if (isset($this->_mode_handlers[$handler])) {
            $handler = $this->_mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler,0,7)=='plugin_') {
            list($handler,$plugin) = explode('_',$handler,2);
            return $this->_parser->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->_parser->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful
     * removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw          The subject to parse. This is the
     *                             content that will be eaten.
     * @return mixed               Three item list of unparsed
     *                             content followed by the
     *                             recognised token and finally the
     *                             action the parser is to take.
     *                             True if no match, false if there
     *                             is a parsing error.
     * @access private
     */
    function _reduce(&$raw) {
        // no patterns registered for the current mode
        if (! isset($this->_regexes[$this->_mode->getCurrent()])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->_regexes[$this->_mode->getCurrent()]->split($raw, $split)) {
            // $raw is replaced by the text following the match
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }
}
/**
 * Escapes regex characters other than (, ) and /
 *
 * Puts a backslash in front of every occurrence of
 * \ . + * ? [ ^ ] $ { } = ! < > | and :
 *
 * @TODO
 * @param string $str              Text to escape.
 * @return string                  The escaped text.
 */
function Doku_Lexer_Escape($str) {
    // one character class covers every character that needs a backslash;
    // '\\\\' in the replacement yields a single literal backslash
    $special = '/[\\\\.+*?\[\]^${}=!<>|:]/';
    return preg_replace($special, '\\\\$0', $str);
}
600 //Setup VIM: ex: et ts=4 sw=4 enc=utf-8 :