"MDL-12304, fix double text"
[moodle-linuxchix.git] / lib / lexer.php
blobfe01255157d21bc4b095cf0b658b0b14d6384dc1
<?php // $Id$

/* PHP lexer code snarfed from the CVS tree for the lamplib project at
 * http://sourceforge.net/projects/lamplib
 * This project is administered by Markus Baker, Harry Fuecks and Matt
 * Mitchell, and the project code is in the public domain.
 *
 * Thanks, guys!
 */
// Token categories handed to the parser's handler methods as their second
// argument (see Lexer::_dispatchTokens() and Lexer::parse()).
define("LEXER_ENTER", 1);     // Token matched an entry pattern; its mode has just been entered.
define("LEXER_MATCHED", 2);   // Token matched a plain pattern; the mode is unchanged.
define("LEXER_UNMATCHED", 3); // Text that no pattern of the current mode matched.
define("LEXER_EXIT", 4);      // Token matched an exit pattern; the mode is left afterwards.
define("LEXER_SPECIAL", 5);   // Token matched a special pattern (one-token mode).
/**
 * Compounded regular expression. Any of the contained patterns
 * could match and, when one does, its label is returned.
 */
class ParallelRegex {
    var $_patterns; // Raw patterns exactly as registered, in registration order.
    var $_labels;   // Label for each pattern, index-aligned with $_patterns.
    var $_regex;    // Cached compound regex; null whenever it must be rebuilt.
    var $_case;     // True for case sensitive matching.

    /**
     * Constructor. Starts with no patterns.
     * @param $case    True for case sensitive, false
     *                 for insensitive.
     * @public
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     * PHP 4 style constructor, kept so that code calling it by
     * name (for example a subclass constructor) keeps working.
     * PHP 8 no longer runs a method named after the class on
     * "new", hence the real work lives in __construct().
     * @param $case    True for case sensitive, false
     *                 for insensitive.
     * @public
     */
    function ParallelRegex($case) {
        $this->__construct($case);
    }

    /**
     * Adds a pattern with an optional label.
     * @param $pattern Perl style regex, but ( and )
     *                 lose the usual meaning.
     * @param $label   Label of regex to be returned
     *                 on a match.
     * @public
     */
    function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        // Invalidate the cache so the next match() recompiles.
        $this->_regex = null;
    }

    /**
     * Attempts to match all patterns at once against
     * a string.
     * @param $subject String to match against.
     * @param $match   First matched portion of
     *                 subject. Set to "" when nothing
     *                 matched.
     * @return         False if nothing matched, otherwise
     *                 the label of the pattern that matched
     *                 (true for unlabelled patterns).
     * @public
     */
    function match($subject, &$match) {
        $match = "";
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (!preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            return false;
        }
        $match = $matches[0];
        // Every pattern is wrapped in exactly one capture group, so
        // group $i belongs to label $i - 1. Groups that did not take
        // part in the match are reported as "". Compare against ""
        // explicitly: a truthiness test would skip a matched "0".
        $groups = count($matches);
        for ($i = 1; $i < $groups; $i++) {
            if ($matches[$i] !== "") {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     * The registered patterns are left untouched so
     * that the regex can be rebuilt safely after more
     * patterns have been added.
     * @return         Compound regex as string.
     * @private
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            $groups = array();
            foreach ($this->_patterns as $pattern) {
                // Every "(" ")" and "/" gets a backslash in front,
                // including ones the caller already escaped.
                $groups[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return         Flags as string.
     * @private
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}
/**
 * States for a stack machine.
 */
class StateStack {
    var $_stack; // List of state names; the last element is the current state.

    /**
     * Constructor. Starts in named state.
     * @param $start   Starting state name.
     * @public
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * PHP 4 style constructor, kept so that code calling it by
     * name keeps working. PHP 8 no longer runs a method named
     * after the class on "new", hence the real work lives in
     * __construct().
     * @param $start   Starting state name.
     * @public
     */
    function StateStack($start) {
        $this->__construct($start);
    }

    /**
     * Accessor for current state.
     * @return         State as string.
     * @public
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     * Adds a state to the stack and sets it
     * to be the current state.
     * @param $state   New state.
     * @public
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     * Leaves the current state and reverts
     * to the previous one. The starting state
     * is never removed.
     * @return         False if we drop off
     *                 the bottom of the list.
     * @public
     */
    function leave() {
        if (count($this->_stack) <= 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}
/**
 * Accepts text and breaks it into tokens.
 * Some optimisation to make sure the
 * content is only scanned by the PHP regex
 * parser once. Lexer modes must not start
 * with leading underscores.
 */
class Lexer {
    var $_regexes;       // Map of mode name to the ParallelRegex for that mode.
    var $_parser;        // Object whose handler methods receive the tokens.
    var $_mode;          // StateStack of active modes.
    var $_mode_handlers; // Map of mode name to the handler method name replacing it.
    var $_case;          // True for case sensitive matching.

    /**
     * Sets up the lexer in case insensitive matching
     * by default.
     * @param $parser  Handling strategy by
     *                 reference.
     * @param $start   Starting handler.
     * @param $case    True for case sensitive.
     * @public
     */
    function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        $this->_mode = new StateStack($start);
        $this->_mode_handlers = array();
    }

    /**
     * PHP 4 style constructor, kept so that code calling it by
     * name (for example a subclass constructor) keeps working.
     * PHP 8 no longer runs a method named after the class on
     * "new", hence the real work lives in __construct().
     * @param $parser  Handling strategy by
     *                 reference.
     * @param $start   Starting handler.
     * @param $case    True for case sensitive.
     * @public
     */
    function Lexer(&$parser, $start = "accept", $case = false) {
        $this->__construct($parser, $start, $case);
    }

    /**
     * Adds a token search pattern for a particular
     * parsing mode. The pattern does not change the
     * current mode.
     * @param $pattern Perl style regex, but ( and )
     *                 lose the usual meaning.
     * @param $mode    Should only apply this
     *                 pattern when dealing with
     *                 this type of input.
     * @public
     */
    function addPattern($pattern, $mode = "accept") {
        if (!isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing
     * mode. Useful for entering parenthesis, strings,
     * tags, etc.
     * @param $pattern  Perl style regex, but ( and )
     *                  lose the usual meaning.
     * @param $mode     Should only apply this
     *                  pattern when dealing with
     *                  this type of input.
     * @param $new_mode Change parsing to this new
     *                  nested mode.
     * @public
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        if (!isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        // The pattern's label is the mode to enter.
        $this->_regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode
     * and re-enter the previous one.
     * @param $pattern Perl style regex, but ( and )
     *                 lose the usual meaning.
     * @param $mode    Mode to leave.
     * @public
     */
    function addExitPattern($pattern, $mode) {
        if (!isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        // "__exit" is the reserved label _dispatchTokens() treats as a pop.
        $this->_regexes[$mode]->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     * Acts as an entry and exit pattern in one go.
     * @param $pattern Perl style regex, but ( and )
     *                 lose the usual meaning.
     * @param $mode    Should only apply this
     *                 pattern when dealing with
     *                 this type of input.
     * @param $special Use this mode for this one token.
     * @public
     */
    function addSpecialPattern($pattern, $mode, $special) {
        if (!isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        // A single leading underscore marks the label as a special mode.
        $this->_regexes[$mode]->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     * @param $mode    Mode to be remapped.
     * @param $handler New target handler.
     * @public
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens. Will fail
     * if the handlers report an error or if no
     * content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the
     * held listener.
     * @param $raw     Raw HTML text.
     * @return         True on success, else false.
     * @public
     */
    function parse($raw) {
        if (!isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            if (!$this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            // A match that consumed nothing would loop forever.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        // _reduce() hands back false for an error, true when done.
        if (!$parsed) {
            return false;
        }
        // Whatever is left over goes to the parser as unmatched text.
        return $this->_invokeParser($raw, LEXER_UNMATCHED);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     * @param $unmatched Unmatched leading portion.
     * @param $matched   Actual token match.
     * @param $mode      Mode after match. The "__exit"
     *                   mode causes a stack pop. A mode
     *                   with one leading underscore is
     *                   used for this token only. A
     *                   non-string mode causes no change.
     * @return           False if there was any error
     *                   from the parser.
     * @private
     */
    function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (!$this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if ($mode === "__exit") {
            if (!$this->_invokeParser($matched, LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if (is_string($mode) && strncmp($mode, "_", 1) == 0) {
            // Special pattern: enter the mode for this one token only.
            $mode = substr($mode, 1);
            $this->_mode->enter($mode);
            if (!$this->_invokeParser($matched, LEXER_SPECIAL)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if (is_string($mode)) {
            $this->_mode->enter($mode);
            return $this->_invokeParser($matched, LEXER_ENTER);
        }
        return $this->_invokeParser($matched, LEXER_MATCHED);
    }

    /**
     * Calls the parser method named after the current
     * mode. Empty content will be ignored.
     * @param $content  Text parsed.
     * @param $is_match One of the LEXER_* constants
     *                  describing the token.
     * @return          True for empty content, otherwise
     *                  whatever the handler returns.
     * @private
     */
    function _invokeParser($content, $is_match) {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->_mode->getCurrent();
        if (isset($this->_mode_handlers[$handler])) {
            $handler = $this->_mode_handlers[$handler];
        }
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     * Tries to match a chunk of text and if successful
     * removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     * @param $raw  The subject to parse. This is the
     *              content that will be eaten.
     * @return      Three item list of unparsed
     *              content followed by the
     *              recognised token and finally the
     *              action the parser is to take.
     *              True if no match, false if there
     *              is a parsing error.
     * @private
     */
    function _reduce(&$raw) {
        $current = $this->_mode->getCurrent();
        if (!isset($this->_regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        $action = $this->_regexes[$current]->match($raw, $match);
        // Only a strict false means "no match"; a label such as "0"
        // is a legitimate mode name and must not be discarded.
        if ($action !== false) {
            // NOTE(review): this locates the first occurrence of the
            // matched text, which may not be where the regex matched
            // when a pattern relies on context (anchors, \b).
            $count = strpos($raw, $match);
            $unparsed = substr($raw, 0, $count);
            $raw = substr($raw, $count + strlen($match));
            return array($unparsed, $match, $action);
        }
        return true;
    }
}